[ tor-browser ].git.dasho

regexcmp.cpp (183548B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  regexcmp.cpp
      5 //
      6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the ICU regular expression compiler, which is responsible
     10 //  for processing a regular expression pattern into the compiled form that
     11 //  is used by the match finding engine.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     17 
     18 #include "unicode/ustring.h"
     19 #include "unicode/unistr.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/uchriter.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/parseerr.h"
     25 #include "unicode/regex.h"
     26 #include "unicode/utf.h"
     27 #include "unicode/utf16.h"
     28 #include "patternprops.h"
     29 #include "putilimp.h"
     30 #include "cmemory.h"
     31 #include "cstr.h"
     32 #include "cstring.h"
     33 #include "uvectr32.h"
     34 #include "uvectr64.h"
     35 #include "uassert.h"
     36 #include "uinvchar.h"
     37 
     38 #include "regeximp.h"
     39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     40                        //   generated by a Perl script.
     41 #include "regexcmp.h"
     42 #include "regexst.h"
     43 #include "regextxt.h"
     44 
     45 
     46 
     47 U_NAMESPACE_BEGIN
     48 
     49 
     50 //------------------------------------------------------------------------------
     51 //
     52 //  Constructor.
     53 //
     54 //------------------------------------------------------------------------------
     55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
     56   fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
     57 {
     58    // Lazy init of all shared global sets (needed for init()'s empty text)
     59    RegexStaticSets::initGlobals(&status);
     60 
     61    fStatus           = &status;
     62 
     63    fRXPat            = rxp;
     64    fScanIndex        = 0;
     65    fLastChar         = -1;
     66    fPeekChar         = -1;
     67    fLineNum          = 1;
     68    fCharNum          = 0;
     69    fQuoteMode        = false;
     70    fInBackslashQuote = false;
     71    fModeFlags        = fRXPat->fFlags | 0x80000000;
     72    fEOLComments      = true;
     73 
     74    fMatchOpenParen   = -1;
     75    fMatchCloseParen  = -1;
     76    fCaptureName      = nullptr;
     77    fLastSetLiteral   = U_SENTINEL;
     78 
     79    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
     80        status = rxp->fDeferredStatus;
     81    }
     82 }
     83 
     84 static const char16_t   chAmp       = 0x26;      // '&'
     85 static const char16_t   chDash      = 0x2d;      // '-'
     86 
     87 
     88 //------------------------------------------------------------------------------
     89 //
     90 //  Destructor
     91 //
     92 //------------------------------------------------------------------------------
     93 RegexCompile::~RegexCompile() {
     94    delete fCaptureName;         // Normally will be nullptr, but can exist if pattern
     95                                 //   compilation stops with a syntax error.
     96 }
     97 
     98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
     99    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
    100 }
    101 
    102 //------------------------------------------------------------------------------
    103 //
    104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
    105 //                           The state tables are hand-written in the file regexcst.txt,
    106 //                           and converted to the form used here by a perl
    107 //                           script regexcst.pl
    108 //
    109 //------------------------------------------------------------------------------
    110 void    RegexCompile::compile(
    111                         const UnicodeString &pat,   // Source pat to be compiled.
    112                         UParseError &pp,            // Error position info
    113                         UErrorCode &e)              // Error Code
    114 {
    115    fRXPat->fPatternString = new UnicodeString(pat);
    116    UText patternText = UTEXT_INITIALIZER;
    117    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
    118 
    119    if (U_SUCCESS(e)) {
    120        compile(&patternText, pp, e);
    121        utext_close(&patternText);
    122    }
    123 }
    124 
    125 //
    126 //   compile, UText mode
    127 //     All the work is actually done here.
    128 //
    129 void    RegexCompile::compile(
    130                         UText *pat,                 // Source pat to be compiled.
    131                         UParseError &pp,            // Error position info
    132                         UErrorCode &e)              // Error Code
    133 {
    134    fStatus             = &e;
    135    fParseErr           = &pp;
    136    fStackPtr           = 0;
    137    fStack[fStackPtr]   = 0;
    138 
    139    if (U_FAILURE(*fStatus)) {
    140        return;
    141    }
    142 
    143    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
    144    U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0);
    145 
    146    // Prepare the RegexPattern object to receive the compiled pattern.
    147    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, false, true, fStatus);
    148    if (U_FAILURE(*fStatus)) {
    149        return;
    150    }
    151 
    152    // Initialize the pattern scanning state machine
    153    fPatternLength = utext_nativeLength(pat);
    154    uint16_t                state = 1;
    155    const RegexTableEl      *tableEl;
    156 
    157    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
    158    if (fModeFlags & UREGEX_LITERAL) {
    159        fQuoteMode = true;
    160    }
    161 
    162    nextChar(fC);                        // Fetch the first char from the pattern string.
    163 
    164    //
    165    // Main loop for the regex pattern parsing state machine.
    166    //   Runs once per state transition.
    167    //   Each time through optionally performs, depending on the state table,
    168    //      - an advance to the next pattern char
    169    //      - an action to be performed.
    170    //      - pushing or popping a state to/from the local state return stack.
    171    //   file regexcst.txt is the source for the state table.  The logic behind
    172    //     recongizing the pattern syntax is there, not here.
    173    //
    174    for (;;) {
    175        //  Bail out if anything has gone wrong.
    176        //  Regex pattern parsing stops on the first error encountered.
    177        if (U_FAILURE(*fStatus)) {
    178            break;
    179        }
    180 
    181        U_ASSERT(state != 0);
    182 
    183        // Find the state table element that matches the input char from the pattern, or the
    184        //    class of the input character.  Start with the first table row for this
    185        //    state, then linearly scan forward until we find a row that matches the
    186        //    character.  The last row for each state always matches all characters, so
    187        //    the search will stop there, if not before.
    188        //
    189        tableEl = &gRuleParseStateTable[state];
    190        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
    191            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
    192 
    193        for (;;) {    // loop through table rows belonging to this state, looking for one
    194                      //   that matches the current input char.
    195            REGEX_SCAN_DEBUG_PRINTF(("."));
    196            if (tableEl->fCharClass < 127 && fC.fQuoted == false &&   tableEl->fCharClass == fC.fChar) {
    197                // Table row specified an individual character, not a set, and
    198                //   the input character is not quoted, and
    199                //   the input character matched it.
    200                break;
    201            }
    202            if (tableEl->fCharClass == 255) {
    203                // Table row specified default, match anything character class.
    204                break;
    205            }
    206            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
    207                // Table row specified "quoted" and the char was quoted.
    208                break;
    209            }
    210            if (tableEl->fCharClass == 253 && fC.fChar == static_cast<UChar32>(-1)) {
    211                // Table row specified eof and we hit eof on the input.
    212                break;
    213            }
    214 
    215            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
    216                fC.fQuoted == false &&                                       //   char is not escaped &&
    217                fC.fChar != static_cast<UChar32>(-1)) {                      //   char is not EOF
    218                U_ASSERT(tableEl->fCharClass <= 137);
    219                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
    220                    // Table row specified a character class, or set of characters,
    221                    //   and the current char matches it.
    222                    break;
    223                }
    224            }
    225 
    226            // No match on this row, advance to the next  row for this state,
    227            tableEl++;
    228        }
    229        REGEX_SCAN_DEBUG_PRINTF(("\n"));
    230 
    231        //
    232        // We've found the row of the state table that matches the current input
    233        //   character from the rules string.
    234        // Perform any action specified  by this row in the state table.
    235        if (doParseActions(tableEl->fAction) == false) {
    236            // Break out of the state machine loop if the
    237            //   the action signalled some kind of error, or
    238            //   the action was to exit, occurs on normal end-of-rules-input.
    239            break;
    240        }
    241 
    242        if (tableEl->fPushState != 0) {
    243            fStackPtr++;
    244            if (fStackPtr >= kStackSize) {
    245                error(U_REGEX_INTERNAL_ERROR);
    246                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
    247                fStackPtr--;
    248            }
    249            fStack[fStackPtr] = tableEl->fPushState;
    250        }
    251 
    252        //
    253        //  NextChar.  This is where characters are actually fetched from the pattern.
    254        //             Happens under control of the 'n' tag in the state table.
    255        //
    256        if (tableEl->fNextChar) {
    257            nextChar(fC);
    258        }
    259 
    260        // Get the next state from the table entry, or from the
    261        //   state stack if the next state was specified as "pop".
    262        if (tableEl->fNextState != 255) {
    263            state = tableEl->fNextState;
    264        } else {
    265            state = fStack[fStackPtr];
    266            fStackPtr--;
    267            if (fStackPtr < 0) {
    268                // state stack underflow
    269                // This will occur if the user pattern has mis-matched parentheses,
    270                //   with extra close parens.
    271                //
    272                fStackPtr++;
    273                error(U_REGEX_MISMATCHED_PAREN);
    274            }
    275        }
    276 
    277    }
    278 
    279    if (U_FAILURE(*fStatus)) {
    280        // Bail out if the pattern had errors.
    281        return;
    282    }
    283 
    284    //
    285    // The pattern has now been read and processed, and the compiled code generated.
    286    //
    287 
    288    //
    289    // The pattern's fFrameSize so far has accumulated the requirements for
    290    //   storage for capture parentheses, counters, etc. that are encountered
    291    //   in the pattern.  Add space for the two variables that are always
    292    //   present in the saved state:  the input string position (int64_t) and
    293    //   the position in the compiled pattern.
    294    //
    295    allocateStackData(RESTACKFRAME_HDRCOUNT);
    296 
    297    //
    298    // Optimization pass 1: NOPs, back-references, and case-folding
    299    //
    300    stripNOPs();
    301 
    302    //
    303    // Get bounds for the minimum and maximum length of a string that this
    304    //   pattern can match.  Used to avoid looking for matches in strings that
    305    //   are too short.
    306    //
    307    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    308 
    309    //
    310    // Optimization pass 2: match start type
    311    //
    312    matchStartType();
    313 
    314    //
    315    // Set up fast latin-1 range sets
    316    //
    317    int32_t numSets = fRXPat->fSets->size();
    318    fRXPat->fSets8 = new Regex8BitSet[numSets];
    319    // Null pointer check.
    320    if (fRXPat->fSets8 == nullptr) {
    321        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
    322        return;
    323    }
    324    int32_t i;
    325    for (i=0; i<numSets; i++) {
    326        UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(i));
    327        fRXPat->fSets8[i].init(s);
    328    }
    329 
    330 }
    331 
    332 
    333 
    334 
    335 
    336 //------------------------------------------------------------------------------
    337 //
    338 //  doParseAction        Do some action during regex pattern parsing.
    339 //                       Called by the parse state machine.
    340 //
    341 //                       Generation of the match engine PCode happens here, or
    342 //                       in functions called from the parse actions defined here.
    343 //
    344 //
    345 //------------------------------------------------------------------------------
    346 UBool RegexCompile::doParseActions(int32_t action)
    347 {
    348    UBool   returnVal = true;
    349 
    350    switch (static_cast<Regex_PatternParseAction>(action)) {
    351 
    352    case doPatStart:
    353        // Start of pattern compiles to:
    354        //0   SAVE   2        Fall back to position of FAIL
    355        //1   jmp    3
    356        //2   FAIL            Stop if we ever reach here.
    357        //3   NOP             Dummy, so start of pattern looks the same as
    358        //                    the start of an ( grouping.
    359        //4   NOP             Resreved, will be replaced by a save if there are
    360        //                    OR | operators at the top level
    361        appendOp(URX_STATE_SAVE, 2);
    362        appendOp(URX_JMP,  3);
    363        appendOp(URX_FAIL, 0);
    364 
    365        // Standard open nonCapture paren action emits the two NOPs and
    366        //   sets up the paren stack frame.
    367        doParseActions(doOpenNonCaptureParen);
    368        break;
    369 
    370    case doPatFinish:
    371        // We've scanned to the end of the pattern
    372        //  The end of pattern compiles to:
    373        //        URX_END
    374        //    which will stop the runtime match engine.
    375        //  Encountering end of pattern also behaves like a close paren,
    376        //   and forces fixups of the State Save at the beginning of the compiled pattern
    377        //   and of any OR operations at the top level.
    378        //
    379        handleCloseParen();
    380        if (fParenStack.size() > 0) {
    381            // Missing close paren in pattern.
    382            error(U_REGEX_MISMATCHED_PAREN);
    383        }
    384 
    385        // add the END operation to the compiled pattern.
    386        appendOp(URX_END, 0);
    387 
    388        // Terminate the pattern compilation state machine.
    389        returnVal = false;
    390        break;
    391 
    392 
    393 
    394    case doOrOperator:
    395        // Scanning a '|', as in (A|B)
    396        {
    397            // Generate code for any pending literals preceding the '|'
    398            fixLiterals(false);
    399 
    400            // Insert a SAVE operation at the start of the pattern section preceding
    401            //   this OR at this level.  This SAVE will branch the match forward
    402            //   to the right hand side of the OR in the event that the left hand
    403            //   side fails to match and backtracks.  Locate the position for the
    404            //   save from the location on the top of the parentheses stack.
    405            int32_t savePosition = fParenStack.popi();
    406            int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(savePosition));
    407            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
    408            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
    409            fRXPat->fCompiledPat->setElementAt(op, savePosition);
    410 
    411            // Append an JMP operation into the compiled pattern.  The operand for
    412            //  the JMP will eventually be the location following the ')' for the
    413            //  group.  This will be patched in later, when the ')' is encountered.
    414            appendOp(URX_JMP, 0);
    415 
    416            // Push the position of the newly added JMP op onto the parentheses stack.
    417            // This registers if for fixup when this block's close paren is encountered.
    418            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    419 
    420            // Append a NOP to the compiled pattern.  This is the slot reserved
    421            //   for a SAVE in the event that there is yet another '|' following
    422            //   this one.
    423            appendOp(URX_NOP, 0);
    424            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
    425        }
    426        break;
    427 
    428 
    429    case doBeginNamedCapture:
    430        // Scanning (?<letter.
    431        //   The first letter of the name will come through again under doConinueNamedCapture.
    432        fCaptureName = new UnicodeString();
    433        if (fCaptureName == nullptr) {
    434            error(U_MEMORY_ALLOCATION_ERROR);
    435        }
    436        break;
    437 
    438    case  doContinueNamedCapture:
    439        fCaptureName->append(fC.fChar);
    440        break;
    441 
    442    case doBadNamedCapture:
    443        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    444        break;
    445        
    446    case doOpenCaptureParen:
    447        // Open Capturing Paren, possibly named.
    448        //   Compile to a
    449        //      - NOP, which later may be replaced by a save-state if the
    450        //         parenthesized group gets a * quantifier, followed by
    451        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
    452        //      - NOP, which may later be replaced by a save-state if there
    453        //             is an '|' alternation within the parens.
    454        //
    455        //    Each capture group gets three slots in the save stack frame:
    456        //         0: Capture Group start position (in input string being matched.)
    457        //         1: Capture Group end position.
    458        //         2: Start of Match-in-progress.
    459        //    The first two locations are for a completed capture group, and are
    460        //     referred to by back references and the like.
    461        //    The third location stores the capture start position when an START_CAPTURE is
    462        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
    463        //      END_CAPTURE is encountered.
    464        {
    465            fixLiterals();
    466            appendOp(URX_NOP, 0);
    467            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
    468            appendOp(URX_START_CAPTURE, varsLoc);
    469            appendOp(URX_NOP, 0);
    470 
    471            // On the Parentheses stack, start a new frame and add the positions
    472            //   of the two NOPs.  Depending on what follows in the pattern, the
    473            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    474            //   address of the end of the parenthesized group.
    475            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    476            fParenStack.push(capturing, *fStatus);                        // Frame type.
    477            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
    478            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    479 
    480            // Save the mapping from group number to stack frame variable position.
    481            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
    482 
    483            // If this is a named capture group, add the name->group number mapping.
    484            if (fCaptureName != nullptr) {
    485                if (!fRXPat->initNamedCaptureMap()) {
    486                    if (U_SUCCESS(*fStatus)) {
    487                        error(fRXPat->fDeferredStatus);
    488                    }
    489                    break;
    490                }
    491                int32_t groupNumber = fRXPat->fGroupMap->size();
    492                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
    493                fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
    494                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
    495                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
    496                }
    497            }
    498        }
    499        break;
    500 
    501    case doOpenNonCaptureParen:
    502        // Open non-caputuring (grouping only) Paren.
    503        //   Compile to a
    504        //      - NOP, which later may be replaced by a save-state if the
    505        //         parenthesized group gets a * quantifier, followed by
    506        //      - NOP, which may later be replaced by a save-state if there
    507        //             is an '|' alternation within the parens.
    508        {
    509            fixLiterals();
    510            appendOp(URX_NOP, 0);
    511            appendOp(URX_NOP, 0);
    512 
    513            // On the Parentheses stack, start a new frame and add the positions
    514            //   of the two NOPs.
    515            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    516            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
    517            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    518            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
    519        }
    520         break;
    521 
    522 
    523    case doOpenAtomicParen:
    524        // Open Atomic Paren.  (?>
    525        //   Compile to a
    526        //      - NOP, which later may be replaced if the parenthesized group
    527        //         has a quantifier, followed by
    528        //      - STO_SP  save state stack position, so it can be restored at the ")"
    529        //      - NOP, which may later be replaced by a save-state if there
    530        //             is an '|' alternation within the parens.
    531        {
    532            fixLiterals();
    533            appendOp(URX_NOP, 0);
    534            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
    535            appendOp(URX_STO_SP, varLoc);
    536            appendOp(URX_NOP, 0);
    537 
    538            // On the Parentheses stack, start a new frame and add the positions
    539            //   of the two NOPs.  Depending on what follows in the pattern, the
    540            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
    541            //   address of the end of the parenthesized group.
    542            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    543            fParenStack.push(atomic, *fStatus);                           // Frame type.
    544            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
    545            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
    546        }
    547        break;
    548 
    549 
    550    case doOpenLookAhead:
    551        // Positive Look-ahead   (?=  stuff  )
    552        //
    553        //   Note:   Addition of transparent input regions, with the need to
    554        //           restore the original regions when failing out of a lookahead
    555        //           block, complicated this sequence.  Some combined opcodes
    556        //           might make sense - or might not, lookahead aren't that common.
    557        //
    558        //      Caution:  min match length optimization knows about this
    559        //               sequence; don't change without making updates there too.
    560        //
    561        // Compiles to
    562        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
    563        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
    564        //    3    JMP          6           continue ...
    565        //
    566        //    4.   LA_END                   Look Ahead failed.  Restore regions.
    567        //    5.   BACKTRACK                and back track again.
    568        //
    569        //    6.   NOP              reserved for use by quantifiers on the block.
    570        //                          Look-ahead can't have quantifiers, but paren stack
    571        //                             compile time conventions require the slot anyhow.
    572        //    7.   NOP              may be replaced if there is are '|' ops in the block.
    573        //    8.     code for parenthesized stuff.
    574        //    9.   LA_END
    575        //
    576        //  Four data slots are reserved, for saving state on entry to the look-around
    577        //    0:   stack pointer on entry.
    578        //    1:   input position on entry.
    579        //    2:   fActiveStart, the active bounds start on entry.
    580        //    3:   fActiveLimit, the active bounds limit on entry.
    581        {
    582            fixLiterals();
    583            int32_t dataLoc = allocateData(4);
    584            appendOp(URX_LA_START, dataLoc);
    585            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
    586            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
    587            appendOp(URX_LA_END, dataLoc);
    588            appendOp(URX_BACKTRACK, 0);
    589            appendOp(URX_NOP, 0);
    590            appendOp(URX_NOP, 0);
    591 
    592            // On the Parentheses stack, start a new frame and add the positions
    593            //   of the NOPs.
    594            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    595            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
    596            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
    597            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    598        }
    599        break;
    600 
    601    case doOpenLookAheadNeg:
    602        // Negated Lookahead.   (?! stuff )
    603        // Compiles to
    604        //    1.    LA_START    dataloc
    605        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
    606        //                                //   which continues with the match.
    607        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
    608        //    4.       code for parenthesized stuff.
    609        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
    610        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
    611        //    7.    END_LA                // Restore match region, in case look-ahead was using
    612        //                                        an alternate (transparent) region.
    613        //  Four data slots are reserved, for saving state on entry to the look-around
    614        //    0:   stack pointer on entry.
    615        //    1:   input position on entry.
    616        //    2:   fActiveStart, the active bounds start on entry.
    617        //    3:   fActiveLimit, the active bounds limit on entry.
    618        {
    619            fixLiterals();
    620            int32_t dataLoc = allocateData(4);
    621            appendOp(URX_LA_START, dataLoc);
    622            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
    623            appendOp(URX_NOP, 0);
    624 
    625            // On the Parentheses stack, start a new frame and add the positions
    626            //   of the StateSave and NOP.
    627            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    628            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
    629            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
    630            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
    631 
    632            // Instructions #5 - #7 will be added when the ')' is encountered.
    633        }
    634        break;
    635 
    636    case doOpenLookBehind:
    637        {
    638            //   Compile a (?<= look-behind open paren.
    639            //
    640            //          Compiles to
    641            //              0       URX_LB_START     dataLoc
    642            //              1       URX_LB_CONT      dataLoc
    643            //              2                        MinMatchLen
    644            //              3                        MaxMatchLen
    645            //              4       URX_NOP          Standard '(' boilerplate.
    646            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
    647            //              6         <code for LookBehind expression>
    648            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
    649            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
    650            //
    651            //          Allocate a block of matcher data, to contain (when running a match)
    652            //              0:    Stack ptr on entry
    653            //              1:    Input Index on entry
    654            //              2:    fActiveStart, the active bounds start on entry.
    655            //              3:    fActiveLimit, the active bounds limit on entry.
    656            //              4:    Start index of match current match attempt.
    657            //          The first four items must match the layout of data for LA_START / LA_END
    658 
    659            // Generate match code for any pending literals.
    660            fixLiterals();
    661 
    662            // Allocate data space
    663            int32_t dataLoc = allocateData(5);
    664 
    665            // Emit URX_LB_START
    666            appendOp(URX_LB_START, dataLoc);
    667 
    668            // Emit URX_LB_CONT
    669            appendOp(URX_LB_CONT, dataLoc);
    670            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    671            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    672 
    673            // Emit the NOPs
    674            appendOp(URX_NOP, 0);
    675            appendOp(URX_NOP, 0);
    676 
    677            // On the Parentheses stack, start a new frame and add the positions
    678            //   of the URX_LB_CONT and the NOP.
    679            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    680            fParenStack.push(lookBehind, *fStatus);                       // Frame type
    681            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    682            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    683 
    684            // The final two instructions will be added when the ')' is encountered.
    685        }
    686 
    687        break;
    688 
    689    case doOpenLookBehindNeg:
    690        {
    691            //   Compile a (?<! negated look-behind open paren.
    692            //
    693            //          Compiles to
    694            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
    695            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
    696            //              2                        MinMatchLen
    697            //              3                        MaxMatchLen
    698            //              4                        continueLoc (9)
    699            //              5       URX_NOP          Standard '(' boilerplate.
    700            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
    701            //              7         <code for LookBehind expression>
    702            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
    703            //              9       ...
    704            //
    705            //          Allocate a block of matcher data, to contain (when running a match)
    706            //              0:    Stack ptr on entry
    707            //              1:    Input Index on entry
    708            //              2:    fActiveStart, the active bounds start on entry.
    709            //              3:    fActiveLimit, the active bounds limit on entry.
    710            //              4:    Start index of match current match attempt.
    711            //          The first four items must match the layout of data for LA_START / LA_END
    712 
    713            // Generate match code for any pending literals.
    714            fixLiterals();
    715 
    716            // Allocate data space
    717            int32_t dataLoc = allocateData(5);
    718 
    719            // Emit URX_LB_START
    720            appendOp(URX_LB_START, dataLoc);
    721 
    722            // Emit URX_LBN_CONT
    723            appendOp(URX_LBN_CONT, dataLoc);
    724            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
    725            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
    726            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
    727 
    728            // Emit the NOPs
    729            appendOp(URX_NOP, 0);
    730            appendOp(URX_NOP, 0);
    731 
    732            // On the Parentheses stack, start a new frame and add the positions
    733            //   of the URX_LB_CONT and the NOP.
    734            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
    735            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
    736            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
    737            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
    738 
    739            // The final two instructions will be added when the ')' is encountered.
    740        }
    741        break;
    742 
    743    case doConditionalExpr:
    744        // Conditionals such as (?(1)a:b)
    745    case doPerlInline:
    746        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
    747        error(U_REGEX_UNIMPLEMENTED);
    748        break;
    749 
    750 
    751    case doCloseParen:
    752        handleCloseParen();
    753        if (fParenStack.size() <= 0) {
    754            //  Extra close paren, or missing open paren.
    755            error(U_REGEX_MISMATCHED_PAREN);
    756        }
    757        break;
    758 
    759    case doNOP:
    760        break;
    761 
    762 
    763    case doBadOpenParenType:
    764    case doRuleError:
    765        error(U_REGEX_RULE_SYNTAX);
    766        break;
    767 
    768 
    769    case doMismatchedParenErr:
    770        error(U_REGEX_MISMATCHED_PAREN);
    771        break;
    772 
    773    case doPlus:
    774        //  Normal '+'  compiles to
    775        //     1.   stuff to be repeated  (already built)
    776        //     2.   jmp-sav 1
    777        //     3.   ...
    778        //
    779        //  Or, if the item to be repeated can match a zero length string,
    780        //     1.   STO_INP_LOC  data-loc
    781        //     2.      body of stuff to be repeated
    782        //     3.   JMP_SAV_X    2
    783        //     4.   ...
    784 
    785        //
    786        //  Or, if the item to be repeated is simple
    787        //     1.   Item to be repeated.
    788        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
    789        //     3.   LOOP_C       stack location
    790        {
    791            int32_t  topLoc = blockTopLoc(false);        // location of item #1
    792            int32_t  frameLoc;
    793 
    794            // Check for simple constructs, which may get special optimized code.
    795            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    796                int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc));
    797 
    798                if (URX_TYPE(repeatedOp) == URX_SETREF) {
    799                    // Emit optimized code for [char set]+
    800                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    801                    frameLoc = allocateStackData(1);
    802                    appendOp(URX_LOOP_C, frameLoc);
    803                    break;
    804                }
    805 
    806                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    807                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    808                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    809                    // Emit Optimized code for .+ operations.
    810                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    811                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    812                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
    813                        loopOpI |= 1;
    814                    }
    815                    if (fModeFlags & UREGEX_UNIX_LINES) {
    816                        loopOpI |= 2;
    817                    }
    818                    appendOp(loopOpI);
    819                    frameLoc = allocateStackData(1);
    820                    appendOp(URX_LOOP_C, frameLoc);
    821                    break;
    822                }
    823 
    824            }
    825 
    826            // General case.
    827 
    828            // Check for minimum match length of zero, which requires
    829            //    extra loop-breaking code.
    830            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    831                // Zero length match is possible.
    832                // Emit the code sequence that can handle it.
    833                insertOp(topLoc);
    834                frameLoc = allocateStackData(1);
    835 
    836                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
    837                fRXPat->fCompiledPat->setElementAt(op, topLoc);
    838 
    839                appendOp(URX_JMP_SAV_X, topLoc+1);
    840            } else {
    841                // Simpler code when the repeated body must match something non-empty
    842                appendOp(URX_JMP_SAV, topLoc);
    843            }
    844        }
    845        break;
    846 
    847    case doNGPlus:
    848        //  Non-greedy '+?'  compiles to
    849        //     1.   stuff to be repeated  (already built)
    850        //     2.   state-save  1
    851        //     3.   ...
    852        {
    853            int32_t topLoc      = blockTopLoc(false);
    854            appendOp(URX_STATE_SAVE, topLoc);
    855        }
    856        break;
    857 
    858 
    859    case doOpt:
    860        // Normal (greedy) ? quantifier.
    861        //  Compiles to
    862        //     1. state save 3
    863        //     2.    body of optional block
    864        //     3. ...
    865        // Insert the state save into the compiled pattern, and we're done.
    866        {
    867            int32_t   saveStateLoc = blockTopLoc(true);
    868            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
    869            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    870        }
    871        break;
    872 
    873    case doNGOpt:
    874        // Non-greedy ?? quantifier
    875        //   compiles to
    876        //    1.  jmp   4
    877        //    2.     body of optional block
    878        //    3   jmp   5
    879        //    4.  state save 2
    880        //    5    ...
    881        //  This code is less than ideal, with two jmps instead of one, because we can only
    882        //  insert one instruction at the top of the block being iterated.
    883        {
    884            int32_t  jmp1_loc = blockTopLoc(true);
    885            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
    886 
    887            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
    888            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
    889 
    890            appendOp(URX_JMP, jmp2_loc+2);
    891 
    892            appendOp(URX_STATE_SAVE, jmp1_loc+1);
    893        }
    894        break;
    895 
    896 
    897    case doStar:
    898        // Normal (greedy) * quantifier.
    899        // Compiles to
    900        //       1.   STATE_SAVE   4
    901        //       2.      body of stuff being iterated over
    902        //       3.   JMP_SAV      2
    903        //       4.   ...
    904        //
    905        // Or, if the body is a simple [Set],
    906        //       1.   LOOP_SR_I    set number
    907        //       2.   LOOP_C       stack location
    908        //       ...
    909        //
    910        // Or if this is a .*
    911        //       1.   LOOP_DOT_I    (. matches all mode flag)
    912        //       2.   LOOP_C        stack location
    913        //
    914        // Or, if the body can match a zero-length string, to inhibit infinite loops,
    915        //       1.   STATE_SAVE   5
    916        //       2.   STO_INP_LOC  data-loc
    917        //       3.      body of stuff
    918        //       4.   JMP_SAV_X    2
    919        //       5.   ...
    920        {
    921            // location of item #1, the STATE_SAVE
    922            int32_t   topLoc = blockTopLoc(false);
    923            int32_t   dataLoc = -1;
    924 
    925            // Check for simple *, where the construct being repeated
    926            //   compiled to single opcode, and might be optimizable.
    927            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
    928                int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc));
    929 
    930                if (URX_TYPE(repeatedOp) == URX_SETREF) {
    931                    // Emit optimized code for a [char set]*
    932                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
    933                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    934                    dataLoc = allocateStackData(1);
    935                    appendOp(URX_LOOP_C, dataLoc);
    936                    break;
    937                }
    938 
    939                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
    940                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
    941                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
    942                    // Emit Optimized code for .* operations.
    943                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
    944                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
    945                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
    946                        loopOpI |= 1;
    947                    }
    948                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
    949                        loopOpI |= 2;
    950                    }
    951                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
    952                    dataLoc = allocateStackData(1);
    953                    appendOp(URX_LOOP_C, dataLoc);
    954                    break;
    955                }
    956            }
    957 
    958            // Emit general case code for this *
    959            // The optimizations did not apply.
    960 
    961            int32_t   saveStateLoc = blockTopLoc(true);
    962            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
    963 
    964            // Check for minimum match length of zero, which requires
    965            //    extra loop-breaking code.
    966            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
    967                insertOp(saveStateLoc);
    968                dataLoc = allocateStackData(1);
    969 
    970                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
    971                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
    972                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
    973            }
    974 
    975            // Locate the position in the compiled pattern where the match will continue
    976            //   after completing the *.   (4 or 5 in the comment above)
    977            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
    978 
    979            // Put together the save state op and store it into the compiled code.
    980            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
    981            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
    982 
    983            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
    984            appendOp(jmpOp);
    985        }
    986        break;
    987 
    988    case doNGStar:
    989        // Non-greedy *? quantifier
    990        // compiles to
    991        //     1.   JMP    3
    992        //     2.      body of stuff being iterated over
    993        //     3.   STATE_SAVE  2
    994        //     4    ...
    995        {
    996            int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
    997            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
    998            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
    999            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
   1000            appendOp(URX_STATE_SAVE, jmpLoc+1);
   1001        }
   1002        break;
   1003 
   1004 
   1005    case doIntervalInit:
   1006        // The '{' opening an interval quantifier was just scanned.
   1007        // Init the counter variables that will accumulate the values as the digits
   1008        //    are scanned.
   1009        fIntervalLow = 0;
   1010        fIntervalUpper = -1;
   1011        break;
   1012 
   1013    case doIntevalLowerDigit:
   1014        // Scanned a digit from the lower value of an {lower,upper} interval
   1015        {
   1016            int32_t digitValue = u_charDigitValue(fC.fChar);
   1017            U_ASSERT(digitValue >= 0);
   1018            int64_t val = static_cast<int64_t>(fIntervalLow) * 10 + digitValue;
   1019            if (val > INT32_MAX) {
   1020                error(U_REGEX_NUMBER_TOO_BIG);
   1021            } else {
   1022                fIntervalLow = static_cast<int32_t>(val);
   1023            }
   1024        }
   1025        break;
   1026 
   1027    case doIntervalUpperDigit:
   1028        // Scanned a digit from the upper value of an {lower,upper} interval
   1029        {
   1030            if (fIntervalUpper < 0) {
   1031                fIntervalUpper = 0;
   1032            }
   1033            int32_t digitValue = u_charDigitValue(fC.fChar);
   1034            U_ASSERT(digitValue >= 0);
   1035            int64_t val = static_cast<int64_t>(fIntervalUpper) * 10 + digitValue;
   1036            if (val > INT32_MAX) {
   1037                error(U_REGEX_NUMBER_TOO_BIG);
   1038            } else {
   1039                fIntervalUpper = static_cast<int32_t>(val);
   1040            }
   1041        }
   1042        break;
   1043 
   1044    case doIntervalSame:
   1045        // Scanned a single value interval like {27}.  Upper = Lower.
   1046        fIntervalUpper = fIntervalLow;
   1047        break;
   1048 
   1049    case doInterval:
   1050        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
   1051        if (compileInlineInterval() == false) {
   1052            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1053        }
   1054        break;
   1055 
   1056    case doPossessiveInterval:
   1057        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
   1058        {
   1059            // Remember the loc for the top of the block being looped over.
   1060            //   (Can not reserve a slot in the compiled pattern at this time, because
   1061            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
   1062            //    once per block.)
   1063            int32_t topLoc = blockTopLoc(false);
   1064 
   1065            // Produce normal looping code.
   1066            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
   1067 
   1068            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
   1069            //  just as if the loop was inclosed in atomic parentheses.
   1070 
   1071            // First the STO_SP before the start of the loop
   1072            insertOp(topLoc);
   1073 
   1074            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
   1075            int32_t  op     = buildOp(URX_STO_SP, varLoc);
   1076            fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1077 
   1078            int32_t loopOp = static_cast<int32_t>(fRXPat->fCompiledPat->popi());
   1079            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
   1080            loopOp++;     // point LoopOp after the just-inserted STO_SP
   1081            fRXPat->fCompiledPat->push(loopOp, *fStatus);
   1082 
   1083            // Then the LD_SP after the end of the loop
   1084            appendOp(URX_LD_SP, varLoc);
   1085        }
   1086 
   1087        break;
   1088 
   1089    case doNGInterval:
   1090        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
   1091        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
   1092        break;
   1093 
   1094    case doIntervalError:
   1095        error(U_REGEX_BAD_INTERVAL);
   1096        break;
   1097 
   1098    case doLiteralChar:
   1099        // We've just scanned a "normal" character from the pattern,
   1100        literalChar(fC.fChar);
   1101        break;
   1102 
   1103 
   1104    case doEscapedLiteralChar:
   1105        // We've just scanned an backslashed escaped character with  no
   1106        //   special meaning.  It represents itself.
   1107        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1108            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1109            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1110               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1111             }
   1112        literalChar(fC.fChar);
   1113        break;
   1114 
   1115 
   1116    case doDotAny:
   1117        // scanned a ".",  match any single character.
   1118        {
   1119            fixLiterals(false);
   1120            if (fModeFlags & UREGEX_DOTALL) {
   1121                appendOp(URX_DOTANY_ALL, 0);
   1122            } else if (fModeFlags & UREGEX_UNIX_LINES) {
   1123                appendOp(URX_DOTANY_UNIX, 0);
   1124            } else {
   1125                appendOp(URX_DOTANY, 0);
   1126            }
   1127        }
   1128        break;
   1129 
   1130    case doCaret:
   1131        {
   1132            fixLiterals(false);
   1133            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1134                appendOp(URX_CARET, 0);
   1135            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1136                appendOp(URX_CARET_M, 0);
   1137            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1138                appendOp(URX_CARET, 0);   // Only testing true start of input.
   1139            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1140                appendOp(URX_CARET_M_UNIX, 0);
   1141            }
   1142        }
   1143        break;
   1144 
   1145    case doDollar:
   1146        {
   1147            fixLiterals(false);
   1148            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1149                appendOp(URX_DOLLAR, 0);
   1150            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
   1151                appendOp(URX_DOLLAR_M, 0);
   1152            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1153                appendOp(URX_DOLLAR_D, 0);
   1154            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
   1155                appendOp(URX_DOLLAR_MD, 0);
   1156            }
   1157        }
   1158        break;
   1159 
   1160    case doBackslashA:
   1161        fixLiterals(false);
   1162        appendOp(URX_CARET, 0);
   1163        break;
   1164 
   1165    case doBackslashB:
   1166        {
   1167            #if  UCONFIG_NO_BREAK_ITERATION==1
   1168            if (fModeFlags & UREGEX_UWORD) {
   1169                error(U_UNSUPPORTED_ERROR);
   1170            }
   1171            #endif
   1172            fixLiterals(false);
   1173            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1174            appendOp(op, 1);
   1175        }
   1176        break;
   1177 
   1178    case doBackslashb:
   1179        {
   1180            #if  UCONFIG_NO_BREAK_ITERATION==1
   1181            if (fModeFlags & UREGEX_UWORD) {
   1182                error(U_UNSUPPORTED_ERROR);
   1183            }
   1184            #endif
   1185            fixLiterals(false);
   1186            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
   1187            appendOp(op, 0);
   1188        }
   1189        break;
   1190 
   1191    case doBackslashD:
   1192        fixLiterals(false);
   1193        appendOp(URX_BACKSLASH_D, 1);
   1194        break;
   1195 
   1196    case doBackslashd:
   1197        fixLiterals(false);
   1198        appendOp(URX_BACKSLASH_D, 0);
   1199        break;
   1200 
   1201    case doBackslashG:
   1202        fixLiterals(false);
   1203        appendOp(URX_BACKSLASH_G, 0);
   1204        break;
   1205 
   1206    case doBackslashH:
   1207        fixLiterals(false);
   1208        appendOp(URX_BACKSLASH_H, 1);
   1209        break;
   1210 
   1211    case doBackslashh:
   1212        fixLiterals(false);
   1213        appendOp(URX_BACKSLASH_H, 0);
   1214        break;
   1215 
   1216    case doBackslashR:
   1217        fixLiterals(false);
   1218        appendOp(URX_BACKSLASH_R, 0);
   1219        break;
   1220 
   1221    case doBackslashS:
   1222        fixLiterals(false);
   1223        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
   1224        break;
   1225 
   1226    case doBackslashs:
   1227        fixLiterals(false);
   1228        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
   1229        break;
   1230 
   1231    case doBackslashV:
   1232        fixLiterals(false);
   1233        appendOp(URX_BACKSLASH_V, 1);
   1234        break;
   1235 
   1236    case doBackslashv:
   1237        fixLiterals(false);
   1238        appendOp(URX_BACKSLASH_V, 0);
   1239        break;
   1240 
   1241    case doBackslashW:
   1242        fixLiterals(false);
   1243        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
   1244        break;
   1245 
   1246    case doBackslashw:
   1247        fixLiterals(false);
   1248        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
   1249        break;
   1250 
   1251    case doBackslashX:
   1252        #if  UCONFIG_NO_BREAK_ITERATION==1
   1253        // Grapheme Cluster Boundary requires ICU break iteration.
   1254        error(U_UNSUPPORTED_ERROR);
   1255        #endif
   1256        fixLiterals(false);
   1257        appendOp(URX_BACKSLASH_X, 0);
   1258        break;
   1259 
   1260    case doBackslashZ:
   1261        fixLiterals(false);
   1262        appendOp(URX_DOLLAR, 0);
   1263        break;
   1264 
   1265    case doBackslashz:
   1266        fixLiterals(false);
   1267        appendOp(URX_BACKSLASH_Z, 0);
   1268        break;
   1269 
   1270    case doEscapeError:
   1271        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1272        break;
   1273 
   1274    case doExit:
   1275        fixLiterals(false);
   1276        returnVal = false;
   1277        break;
   1278 
   1279    case doProperty:
   1280        {
   1281            fixLiterals(false);
   1282            UnicodeSet *theSet = scanProp();
   1283            compileSet(theSet);
   1284        }
   1285        break;
   1286 
   1287    case doNamedChar:
   1288        {
   1289            UChar32 c = scanNamedChar();
   1290            literalChar(c);
   1291        }
   1292        break;
   1293 
   1294 
   1295    case doBackRef:
   1296        // BackReference.  Somewhat unusual in that the front-end can not completely parse
   1297        //                 the regular expression, because the number of digits to be consumed
   1298        //                 depends on the number of capture groups that have been defined.  So
   1299        //                 we have to do it here instead.
   1300        {
   1301            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
   1302            int32_t  groupNum = 0;
   1303            UChar32  c        = fC.fChar;
   1304 
   1305            for (;;) {
   1306                // Loop once per digit, for max allowed number of digits in a back reference.
   1307                int32_t digit = u_charDigitValue(c);
   1308                groupNum = groupNum * 10 + digit;
   1309                if (groupNum >= numCaptureGroups) {
   1310                    break;
   1311                }
   1312                c = peekCharLL();
   1313                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
   1314                    break;
   1315                }
   1316                nextCharLL();
   1317            }
   1318 
   1319            // Scan of the back reference in the source regexp is complete.  Now generate
   1320            //  the compiled code for it.
   1321            // Because capture groups can be forward-referenced by back-references,
   1322            //  we fill the operand with the capture group number.  At the end
   1323            //  of compilation, it will be changed to the variable's location.
   1324            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
   1325                                     //    and shouldn't enter this code path at all.
   1326            fixLiterals(false);
   1327            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1328                appendOp(URX_BACKREF_I, groupNum);
   1329            } else {
   1330                appendOp(URX_BACKREF, groupNum);
   1331            }
   1332        }
   1333        break;
   1334 
   1335    case doBeginNamedBackRef:
   1336        U_ASSERT(fCaptureName == nullptr);
   1337        fCaptureName = new UnicodeString;
   1338        if (fCaptureName == nullptr) {
   1339            error(U_MEMORY_ALLOCATION_ERROR);
   1340        }
   1341        break;
   1342            
   1343    case doContinueNamedBackRef:
   1344        fCaptureName->append(fC.fChar);
   1345        break;
   1346 
   1347    case doCompleteNamedBackRef:
   1348        {
   1349        int32_t groupNumber =
   1350            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
   1351        if (groupNumber == 0) {
   1352            // Group name has not been defined.
   1353            //   Could be a forward reference. If we choose to support them at some
   1354            //   future time, extra mechanism will be required at this point.
   1355            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   1356        } else {
   1357            // Given the number, handle identically to a \n numbered back reference.
   1358            // See comments above, under doBackRef
   1359            fixLiterals(false);
   1360            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1361                appendOp(URX_BACKREF_I, groupNumber);
   1362            } else {
   1363                appendOp(URX_BACKREF, groupNumber);
   1364            }
   1365        }
   1366        delete fCaptureName;
   1367        fCaptureName = nullptr;
   1368        break;
   1369        }
   1370       
   1371    case doPossessivePlus:
   1372        // Possessive ++ quantifier.
   1373        // Compiles to
   1374        //       1.   STO_SP
   1375        //       2.      body of stuff being iterated over
   1376        //       3.   STATE_SAVE 5
   1377        //       4.   JMP        2
   1378        //       5.   LD_SP
   1379        //       6.   ...
   1380        //
   1381        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
   1382        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
   1383        //
   1384        {
   1385            // Emit the STO_SP
   1386            int32_t   topLoc = blockTopLoc(true);
   1387            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
   1388            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1389            fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1390 
   1391            // Emit the STATE_SAVE
   1392            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
   1393 
   1394            // Emit the JMP
   1395            appendOp(URX_JMP, topLoc+1);
   1396 
   1397            // Emit the LD_SP
   1398            appendOp(URX_LD_SP, stoLoc);
   1399        }
   1400        break;
   1401 
   1402    case doPossessiveStar:
   1403        // Possessive *+ quantifier.
   1404        // Compiles to
   1405        //       1.   STO_SP       loc
   1406        //       2.   STATE_SAVE   5
   1407        //       3.      body of stuff being iterated over
   1408        //       4.   JMP          2
   1409        //       5.   LD_SP        loc
   1410        //       6    ...
   1411        // TODO:  do something to cut back the state stack each time through the loop.
   1412        {
   1413            // Reserve two slots at the top of the block.
   1414            int32_t   topLoc = blockTopLoc(true);
   1415            insertOp(topLoc);
   1416 
   1417            // emit   STO_SP     loc
   1418            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
   1419            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1420            fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1421 
   1422            // Emit the SAVE_STATE   5
   1423            int32_t L7 = fRXPat->fCompiledPat->size()+1;
   1424            op = buildOp(URX_STATE_SAVE, L7);
   1425            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1426 
   1427            // Append the JMP operation.
   1428            appendOp(URX_JMP, topLoc+1);
   1429 
   1430            // Emit the LD_SP       loc
   1431            appendOp(URX_LD_SP, stoLoc);
   1432        }
   1433        break;
   1434 
   1435    case doPossessiveOpt:
   1436        // Possessive  ?+ quantifier.
   1437        //  Compiles to
   1438        //     1. STO_SP      loc
   1439        //     2. SAVE_STATE  5
   1440        //     3.    body of optional block
   1441        //     4. LD_SP       loc
   1442        //     5. ...
   1443        //
   1444        {
   1445            // Reserve two slots at the top of the block.
   1446            int32_t   topLoc = blockTopLoc(true);
   1447            insertOp(topLoc);
   1448 
   1449            // Emit the STO_SP
   1450            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
   1451            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
   1452            fRXPat->fCompiledPat->setElementAt(op, topLoc);
   1453 
   1454            // Emit the SAVE_STATE
   1455            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
   1456            op = buildOp(URX_STATE_SAVE, continueLoc);
   1457            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
   1458 
   1459            // Emit the LD_SP
   1460            appendOp(URX_LD_SP, stoLoc);
   1461        }
   1462        break;
   1463 
   1464 
   1465    case doBeginMatchMode:
   1466        fNewModeFlags = fModeFlags;
   1467        fSetModeFlag  = true;
   1468        break;
   1469 
   1470    case doMatchMode:   //  (?i)    and similar
   1471        {
   1472            int32_t  bit = 0;
   1473            switch (fC.fChar) {
   1474            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
   1475            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
   1476            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
   1477            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
   1478            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
   1479            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
   1480            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
   1481            case 0x2d: /* '-' */   fSetModeFlag = false;          break;
   1482            default:
   1483                UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
   1484                                        // by the scanner.
   1485            }
   1486            if (fSetModeFlag) {
   1487                fNewModeFlags |= bit;
   1488            } else {
   1489                fNewModeFlags &= ~bit;
   1490            }
   1491        }
   1492        break;
   1493 
   1494    case doSetMatchMode:
   1495        // Emit code to match any pending literals, using the not-yet changed match mode.
   1496        fixLiterals();
   1497 
   1498        // We've got a (?i) or similar.  The match mode is being changed, but
   1499        //   the change is not scoped to a parenthesized block.
   1500        U_ASSERT(fNewModeFlags < 0);
   1501        fModeFlags = fNewModeFlags;
   1502 
   1503        break;
   1504 
   1505 
   1506    case doMatchModeParen:
   1507        // We've got a (?i: or similar.  Begin a parenthesized block, save old
   1508        //   mode flags so they can be restored at the close of the block.
   1509        //
   1510        //   Compile to a
   1511        //      - NOP, which later may be replaced by a save-state if the
   1512        //         parenthesized group gets a * quantifier, followed by
   1513        //      - NOP, which may later be replaced by a save-state if there
   1514        //             is an '|' alternation within the parens.
   1515        {
   1516            fixLiterals(false);
   1517            appendOp(URX_NOP, 0);
   1518            appendOp(URX_NOP, 0);
   1519 
   1520            // On the Parentheses stack, start a new frame and add the positions
   1521            //   of the two NOPs (a normal non-capturing () frame, except for the
   1522            //   saving of the original mode flags.)
   1523            fParenStack.push(fModeFlags, *fStatus);
   1524            fParenStack.push(flags, *fStatus);                            // Frame Marker
   1525            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
   1526            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
   1527 
   1528            // Set the current mode flags to the new values.
   1529            U_ASSERT(fNewModeFlags < 0);
   1530            fModeFlags = fNewModeFlags;
   1531        }
   1532        break;
   1533 
   1534    case doBadModeFlag:
   1535        error(U_REGEX_INVALID_FLAG);
   1536        break;
   1537 
   1538    case doSuppressComments:
   1539        // We have just scanned a '(?'.  We now need to prevent the character scanner from
   1540        // treating a '#' as a to-the-end-of-line comment.
   1541        //   (This Perl compatibility just gets uglier and uglier to do...)
   1542        fEOLComments = false;
   1543        break;
   1544 
   1545 
   1546    case doSetAddAmp:
   1547        {
   1548          UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1549          set->add(chAmp);
   1550        }
   1551        break;
   1552 
   1553    case doSetAddDash:
   1554        {
   1555          UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1556          set->add(chDash);
   1557        }
   1558        break;
   1559 
   1560     case doSetBackslash_s:
   1561        {
   1562         UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1563         set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
   1564         break;
   1565        }
   1566 
   1567     case doSetBackslash_S:
   1568        {
   1569            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1570            UnicodeSet SSet;
   1571            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
   1572            set->addAll(SSet);
   1573            break;
   1574        }
   1575 
   1576    case doSetBackslash_d:
   1577        {
   1578            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1579            // TODO - make a static set, ticket 6058.
   1580            addCategory(set, U_GC_ND_MASK, *fStatus);
   1581            break;
   1582        }
   1583 
   1584    case doSetBackslash_D:
   1585        {
   1586            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1587            UnicodeSet digits;
   1588            // TODO - make a static set, ticket 6058.
   1589            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   1590            digits.complement();
   1591            set->addAll(digits);
   1592            break;
   1593        }
   1594 
   1595    case doSetBackslash_h:
   1596        {
   1597            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1598            UnicodeSet h;
   1599            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1600            h.add(static_cast<UChar32>(9)); // Tab
   1601            set->addAll(h);
   1602            break;
   1603        }
   1604 
   1605    case doSetBackslash_H:
   1606        {
   1607            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1608            UnicodeSet h;
   1609            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   1610            h.add(static_cast<UChar32>(9)); // Tab
   1611            h.complement();
   1612            set->addAll(h);
   1613            break;
   1614        }
   1615 
   1616    case doSetBackslash_v:
   1617        {
   1618            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1619            set->add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
   1620            set->add(static_cast<UChar32>(0x85));
   1621            set->add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
   1622            break;
   1623        }
   1624 
   1625    case doSetBackslash_V:
   1626        {
   1627            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1628            UnicodeSet v;
   1629            v.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
   1630            v.add(static_cast<UChar32>(0x85));
   1631            v.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
   1632            v.complement();
   1633            set->addAll(v);
   1634            break;
   1635        }
   1636 
   1637    case doSetBackslash_w:
   1638        {
   1639            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1640            set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
   1641            break;
   1642        }
   1643 
   1644    case doSetBackslash_W:
   1645        {
   1646            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
   1647            UnicodeSet SSet;
   1648            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
   1649            set->addAll(SSet);
   1650            break;
   1651        }
   1652 
   1653    case doSetBegin:
   1654        {
   1655            fixLiterals(false);
   1656            LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
   1657            fSetStack.push(lpSet.orphan(), *fStatus);
   1658            fSetOpStack.push(setStart, *fStatus);
   1659            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1660                fSetOpStack.push(setCaseClose, *fStatus);
   1661            }
   1662            break;
   1663        }
   1664 
   1665    case doSetBeginDifference1:
   1666        //  We have scanned something like [[abc]-[
   1667        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
   1668        //  Push a Difference operator, which will cause the new set to be subtracted from what
   1669        //    went before once it is created.
   1670        setPushOp(setDifference1);
   1671        fSetOpStack.push(setStart, *fStatus);
   1672        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1673            fSetOpStack.push(setCaseClose, *fStatus);
   1674        }
   1675        break;
   1676 
   1677    case doSetBeginIntersection1:
   1678        //  We have scanned something like  [[abc]&[
   1679        //   Need both the '&' operator and the open '[' operator.
   1680        setPushOp(setIntersection1);
   1681        fSetOpStack.push(setStart, *fStatus);
   1682        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1683            fSetOpStack.push(setCaseClose, *fStatus);
   1684        }
   1685        break;
   1686 
   1687    case doSetBeginUnion:
   1688        //  We have scanned something like  [[abc][
   1689        //     Need to handle the union operation explicitly [[abc] | [
   1690        setPushOp(setUnion);
   1691        fSetOpStack.push(setStart, *fStatus);
   1692        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
   1693            fSetOpStack.push(setCaseClose, *fStatus);
   1694        }
   1695        break;
   1696 
   1697    case doSetDifference2:
   1698        // We have scanned something like [abc--
   1699        //   Consider this to unambiguously be a set difference operator.
   1700        setPushOp(setDifference2);
   1701        break;
   1702 
   1703    case doSetEnd:
   1704        // Have encountered the ']' that closes a set.
   1705        //    Force the evaluation of any pending operations within this set,
   1706        //    leave the completed set on the top of the set stack.
   1707        setEval(setEnd);
   1708        U_ASSERT(fSetOpStack.peeki()==setStart);
   1709        fSetOpStack.popi();
   1710        break;
   1711 
   1712    case doSetFinish:
   1713        {
   1714        // Finished a complete set expression, including all nested sets.
   1715        //   The close bracket has already triggered clearing out pending set operators,
   1716        //    the operator stack should be empty and the operand stack should have just
   1717        //    one entry, the result set.
   1718        U_ASSERT(fSetOpStack.empty());
   1719        UnicodeSet* theSet = static_cast<UnicodeSet*>(fSetStack.pop());
   1720        U_ASSERT(fSetStack.empty());
   1721        compileSet(theSet);
   1722        break;
   1723        }
   1724 
   1725    case doSetIntersection2:
   1726        // Have scanned something like [abc&&
   1727        setPushOp(setIntersection2);
   1728        break;
   1729 
   1730    case doSetLiteral:
   1731        // Union the just-scanned literal character into the set being built.
   1732        //    This operation is the highest precedence set operation, so we can always do
   1733        //    it immediately, without waiting to see what follows.  It is necessary to perform
   1734        //    any pending '-' or '&' operation first, because these have the same precedence
   1735        //    as union-ing in a literal'
   1736        {
   1737            setEval(setUnion);
   1738            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
   1739            s->add(fC.fChar);
   1740            fLastSetLiteral = fC.fChar;
   1741            break;
   1742        }
   1743 
   1744    case doSetLiteralEscaped:
   1745        // A back-slash escaped literal character was encountered.
   1746        // Processing is the same as with setLiteral, above, with the addition of
   1747        //  the optional check for errors on escaped ASCII letters.
   1748        {
   1749            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
   1750                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
   1751                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
   1752                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   1753            }
   1754            setEval(setUnion);
   1755            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
   1756            s->add(fC.fChar);
   1757            fLastSetLiteral = fC.fChar;
   1758            break;
   1759        }
   1760 
   1761        case doSetNamedChar:
   1762        // Scanning a \N{UNICODE CHARACTER NAME}
   1763        //  Aside from the source of the character, the processing is identical to doSetLiteral,
   1764        //    above.
   1765        {
   1766            UChar32  c = scanNamedChar();
   1767            setEval(setUnion);
   1768            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
   1769            s->add(c);
   1770            fLastSetLiteral = c;
   1771            break;
   1772        }
   1773 
   1774    case doSetNamedRange:
   1775        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
   1776        // The left character is already in the set, and is saved in fLastSetLiteral.
   1777        // The right side needs to be picked up, the scan is at the 'N'.
   1778        // Lower Limit > Upper limit being an error matches both Java
   1779        //        and ICU UnicodeSet behavior.
   1780        {
   1781            UChar32  c = scanNamedChar();
   1782            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
   1783                error(U_REGEX_INVALID_RANGE);
   1784            }
   1785            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
   1786            s->add(fLastSetLiteral, c);
   1787            fLastSetLiteral = c;
   1788            break;
   1789        }
   1790 
   1791 
   1792    case  doSetNegate:
   1793        // Scanned a '^' at the start of a set.
   1794        // Push the negation operator onto the set op stack.
   1795        // A twist for case-insensitive matching:
   1796        //   the case closure operation must happen _before_ negation.
   1797        //   But the case closure operation will already be on the stack if it's required.
   1798        //   This requires checking for case closure, and swapping the stack order
   1799        //    if it is present.
   1800        {
   1801            int32_t  tosOp = fSetOpStack.peeki();
   1802            if (tosOp == setCaseClose) {
   1803                fSetOpStack.popi();
   1804                fSetOpStack.push(setNegation, *fStatus);
   1805                fSetOpStack.push(setCaseClose, *fStatus);
   1806            } else {
   1807                fSetOpStack.push(setNegation, *fStatus);
   1808            }
   1809        }
   1810        break;
   1811 
   1812    case doSetNoCloseError:
   1813        error(U_REGEX_MISSING_CLOSE_BRACKET);
   1814        break;
   1815 
   1816    case doSetOpError:
   1817        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
   1818        break;
   1819 
   1820    case doSetPosixProp:
   1821        {
   1822            UnicodeSet *s = scanPosixProp();
   1823            if (s != nullptr) {
   1824                UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek());
   1825                tos->addAll(*s);
   1826                delete s;
   1827            }  // else error.  scanProp() reported the error status already.
   1828        }
   1829        break;
   1830 
   1831    case doSetProp:
   1832        //  Scanned a \p \P within [brackets].
   1833        {
   1834            UnicodeSet *s = scanProp();
   1835            if (s != nullptr) {
   1836                UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek());
   1837                tos->addAll(*s);
   1838                delete s;
   1839            }  // else error.  scanProp() reported the error status already.
   1840        }
   1841        break;
   1842 
   1843 
   1844    case doSetRange:
   1845        // We have scanned literal-literal.  Add the range to the set.
   1846        // The left character is already in the set, and is saved in fLastSetLiteral.
   1847        // The right side is the current character.
   1848        // Lower Limit > Upper limit being an error matches both Java
   1849        //        and ICU UnicodeSet behavior.
   1850        {
   1851 
   1852        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
   1853            error(U_REGEX_INVALID_RANGE);
   1854        }
   1855        UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
   1856        s->add(fLastSetLiteral, fC.fChar);
   1857        break;
   1858        }
   1859 
   1860    default:
   1861        UPRV_UNREACHABLE_EXIT;
   1862    }
   1863 
   1864    if (U_FAILURE(*fStatus)) {
   1865        returnVal = false;
   1866    }
   1867 
   1868    return returnVal;
   1869 }
   1870 
   1871 
   1872 
   1873 //------------------------------------------------------------------------------
   1874 //
   1875 //   literalChar           We've encountered a literal character from the pattern,
   1876 //                             or an escape sequence that reduces to a character.
   1877 //                         Add it to the string containing all literal chars/strings from
   1878 //                             the pattern.
   1879 //
   1880 //------------------------------------------------------------------------------
   1881 void RegexCompile::literalChar(UChar32 c)  {
   1882    fLiteralChars.append(c);
   1883 }
   1884 
   1885 
   1886 //------------------------------------------------------------------------------
   1887 //
   1888 //    fixLiterals           When compiling something that can follow a literal
   1889 //                          string in a pattern, emit the code to match the
   1890 //                          accumulated literal string.
   1891 //
   1892 //                          Optionally, split the last char of the string off into
   1893 //                          a single "ONE_CHAR" operation, so that quantifiers can
   1894 //                          apply to that char alone.  Example:   abc*
   1895 //                          The * must apply to the 'c' only.
   1896 //
   1897 //------------------------------------------------------------------------------
   1898 void    RegexCompile::fixLiterals(UBool split) {
   1899 
   1900    // If no literal characters have been scanned but not yet had code generated
   1901    //   for them, nothing needs to be done.
   1902    if (fLiteralChars.length() == 0) {
   1903        return;
   1904    }
   1905 
   1906    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1907    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1908 
   1909    // Split:  We need to  ensure that the last item in the compiled pattern
   1910    //     refers only to the last literal scanned in the pattern, so that
   1911    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
   1912    //     Split before case folding for case insensitive matches.
   1913 
   1914    if (split) {
   1915        fLiteralChars.truncate(indexOfLastCodePoint);
   1916        fixLiterals(false);   // Recursive call, emit code to match the first part of the string.
   1917                              //  Note that the truncated literal string may be empty, in which case
   1918                              //  nothing will be emitted.
   1919 
   1920        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
   1921        fixLiterals(false);          // Second recursive call, code for the final code point.
   1922        return;
   1923    }
   1924 
   1925    // If we are doing case-insensitive matching, case fold the string.  This may expand
   1926    //   the string, e.g. the German sharp-s turns into "ss"
   1927    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1928        fLiteralChars.foldCase();
   1929        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
   1930        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
   1931    }
   1932 
   1933    if (indexOfLastCodePoint == 0) {
   1934        // Single character, emit a URX_ONECHAR op to match it.
   1935        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
   1936                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
   1937            appendOp(URX_ONECHAR_I, lastCodePoint);
   1938        } else {
   1939            appendOp(URX_ONECHAR, lastCodePoint);
   1940        }
   1941    } else {
   1942        // Two or more chars, emit a URX_STRING to match them.
   1943        if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
   1944            error(U_REGEX_PATTERN_TOO_BIG);
   1945        }
   1946        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   1947            appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
   1948        } else {
   1949            // TODO here:  add optimization to split case sensitive strings of length two
   1950            //             into two single char ops, for efficiency.
   1951            appendOp(URX_STRING, fRXPat->fLiteralText.length());
   1952        }
   1953        appendOp(URX_STRING_LEN, fLiteralChars.length());
   1954 
   1955        // Add this string into the accumulated strings of the compiled pattern.
   1956        fRXPat->fLiteralText.append(fLiteralChars);
   1957    }
   1958 
   1959    fLiteralChars.remove();
   1960 }
   1961 
   1962 
   1963 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
   1964    if (U_FAILURE(*fStatus)) {
   1965        return 0;
   1966    }
   1967    if (type < 0 || type > 255) {
   1968        UPRV_UNREACHABLE_EXIT;
   1969    }
   1970    if (val > 0x00ffffff) {
   1971        UPRV_UNREACHABLE_EXIT;
   1972    }
   1973    if (val < 0) {
   1974        if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
   1975            UPRV_UNREACHABLE_EXIT;
   1976        }
   1977        if (URX_TYPE(val) != 0xff) {
   1978            UPRV_UNREACHABLE_EXIT;
   1979        }
   1980        type = URX_RESERVED_OP_N;
   1981    }
   1982    return (type << 24) | val;
   1983 }
   1984 
   1985 
   1986 //------------------------------------------------------------------------------
   1987 //
   1988 //   appendOp()             Append a new instruction onto the compiled pattern
   1989 //                          Includes error checking, limiting the size of the
   1990 //                          pattern to lengths that can be represented in the
   1991 //                          24 bit operand field of an instruction.
   1992 //
   1993 //------------------------------------------------------------------------------
   1994 void RegexCompile::appendOp(int32_t op) {
   1995    if (U_FAILURE(*fStatus)) {
   1996        return;
   1997    }
   1998    fRXPat->fCompiledPat->addElement(op, *fStatus);
   1999    if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
   2000        error(U_REGEX_PATTERN_TOO_BIG);
   2001    }
   2002 }
   2003 
   2004 void RegexCompile::appendOp(int32_t type, int32_t val) {
   2005    appendOp(buildOp(type, val));
   2006 }
   2007 
   2008 
   2009 //------------------------------------------------------------------------------
   2010 //
   2011 //   insertOp()             Insert a slot for a new opcode into the already
   2012 //                          compiled pattern code.
   2013 //
   2014 //                          Fill the slot with a NOP.  Our caller will replace it
   2015 //                          with what they really wanted.
   2016 //
   2017 //------------------------------------------------------------------------------
   2018 void   RegexCompile::insertOp(int32_t where) {
   2019    UVector64 *code = fRXPat->fCompiledPat;
   2020    U_ASSERT(where>0 && where < code->size());
   2021 
   2022    int32_t  nop = buildOp(URX_NOP, 0);
   2023    code->insertElementAt(nop, where, *fStatus);
   2024 
   2025    // Walk through the pattern, looking for any ops with targets that
   2026    //  were moved down by the insert.  Fix them.
   2027    int32_t loc;
   2028    for (loc=0; loc<code->size(); loc++) {
   2029        int32_t op = static_cast<int32_t>(code->elementAti(loc));
   2030        int32_t opType = URX_TYPE(op);
   2031        int32_t opValue = URX_VAL(op);
   2032        if ((opType == URX_JMP         ||
   2033            opType == URX_JMPX         ||
   2034            opType == URX_STATE_SAVE   ||
   2035            opType == URX_CTR_LOOP     ||
   2036            opType == URX_CTR_LOOP_NG  ||
   2037            opType == URX_JMP_SAV      ||
   2038            opType == URX_JMP_SAV_X    ||
   2039            opType == URX_RELOC_OPRND)    && opValue > where) {
   2040            // Target location for this opcode is after the insertion point and
   2041            //   needs to be incremented to adjust for the insertion.
   2042            opValue++;
   2043            op = buildOp(opType, opValue);
   2044            code->setElementAt(op, loc);
   2045        }
   2046    }
   2047 
   2048    // Now fix up the parentheses stack.  All positive values in it are locations in
   2049    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
   2050    for (loc=0; loc<fParenStack.size(); loc++) {
   2051        int32_t x = fParenStack.elementAti(loc);
   2052        U_ASSERT(x < code->size());
   2053        if (x>where) {
   2054            x++;
   2055            fParenStack.setElementAt(x, loc);
   2056        }
   2057    }
   2058 
   2059    if (fMatchCloseParen > where) {
   2060        fMatchCloseParen++;
   2061    }
   2062    if (fMatchOpenParen > where) {
   2063        fMatchOpenParen++;
   2064    }
   2065 }
   2066 
   2067 
   2068 //------------------------------------------------------------------------------
   2069 //
   2070 //   allocateData()        Allocate storage in the matcher's static data area.
   2071 //                         Return the index for the newly allocated data.
   2072 //                         The storage won't actually exist until we are running a match
   2073 //                         operation, but the storage indexes are inserted into various
   2074 //                         opcodes while compiling the pattern.
   2075 //
   2076 //------------------------------------------------------------------------------
   2077 int32_t RegexCompile::allocateData(int32_t size) {
   2078    if (U_FAILURE(*fStatus)) {
   2079        return 0;
   2080    }
   2081    if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
   2082        error(U_REGEX_INTERNAL_ERROR);
   2083        return 0;
   2084    }
   2085    int32_t dataIndex = fRXPat->fDataSize;
   2086    fRXPat->fDataSize += size;
   2087    if (fRXPat->fDataSize >= 0x00fffff0) {
   2088        error(U_REGEX_INTERNAL_ERROR);
   2089    }
   2090    return dataIndex;
   2091 }
   2092 
   2093 
   2094 //------------------------------------------------------------------------------
   2095 //
   2096 //   allocateStackData()   Allocate space in the back-tracking stack frame.
   2097 //                         Return the index for the newly allocated data.
   2098 //                         The frame indexes are inserted into various
   2099 //                         opcodes while compiling the pattern, meaning that frame
   2100 //                         size must be restricted to the size that will fit
   2101 //                         as an operand (24 bits).
   2102 //
   2103 //------------------------------------------------------------------------------
   2104 int32_t RegexCompile::allocateStackData(int32_t size) {
   2105    if (U_FAILURE(*fStatus)) {
   2106        return 0;
   2107    }
   2108    if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
   2109        error(U_REGEX_INTERNAL_ERROR);
   2110        return 0;
   2111    }
   2112    int32_t dataIndex = fRXPat->fFrameSize;
   2113    fRXPat->fFrameSize += size;
   2114    if (fRXPat->fFrameSize >= 0x00fffff0) {
   2115        error(U_REGEX_PATTERN_TOO_BIG);
   2116    }
   2117    return dataIndex;
   2118 }
   2119 
   2120 
   2121 //------------------------------------------------------------------------------
   2122 //
   2123 //   blockTopLoc()          Find or create a location in the compiled pattern
   2124 //                          at the start of the operation or block that has
   2125 //                          just been compiled.  Needed when a quantifier (* or
   2126 //                          whatever) appears, and we need to add an operation
   2127 //                          at the start of the thing being quantified.
   2128 //
   2129 //                          (Parenthesized Blocks) have a slot with a NOP that
   2130 //                          is reserved for this purpose.  .* or similar don't
   2131 //                          and a slot needs to be added.
   2132 //
   2133 //       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
   2134 //                                         at the returned location.
   2135 //                                 false - just return the address,
   2136 //                                         do not reserve a location there.
   2137 //
   2138 //------------------------------------------------------------------------------
   2139 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
   2140    int32_t   theLoc;
   2141    fixLiterals(true);  // Emit code for any pending literals.
   2142                        //   If last item was a string, emit separate op for the its last char.
   2143    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
   2144    {
   2145        // The item just processed is a parenthesized block.
   2146        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
   2147        U_ASSERT(theLoc > 0);
   2148        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
   2149    }
   2150    else {
   2151        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
   2152        // No slot for STATE_SAVE was pre-reserved in the compiled code.
   2153        // We need to make space now.
   2154        theLoc = fRXPat->fCompiledPat->size()-1;
   2155        int32_t opAtTheLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(theLoc));
   2156        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
   2157            // Strings take two opcode, we want the position of the first one.
   2158            // We can have a string at this point if a single character case-folded to two.
   2159            theLoc--;
   2160        }
   2161        if (reserveLoc) {
   2162            int32_t  nop = buildOp(URX_NOP, 0);
   2163            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
   2164        }
   2165    }
   2166    return theLoc;
   2167 }
   2168 
   2169 
   2170 
   2171 //------------------------------------------------------------------------------
   2172 //
   2173 //    handleCloseParen      When compiling a close paren, we need to go back
   2174 //                          and fix up any JMP or SAVE operations within the
   2175 //                          parenthesized block that need to target the end
   2176 //                          of the block.  The locations of these are kept on
   2177 //                          the paretheses stack.
   2178 //
   2179 //                          This function is called both when encountering a
   2180 //                          real ) and at the end of the pattern.
   2181 //
   2182 //------------------------------------------------------------------------------
   2183 void  RegexCompile::handleCloseParen() {
   2184    int32_t   patIdx;
   2185    int32_t   patOp;
   2186    if (fParenStack.size() <= 0) {
   2187        error(U_REGEX_MISMATCHED_PAREN);
   2188        return;
   2189    }
   2190 
   2191    // Emit code for any pending literals.
   2192    fixLiterals(false);
   2193 
   2194    // Fixup any operations within the just-closed parenthesized group
   2195    //    that need to reference the end of the (block).
   2196    //    (The first one popped from the stack is an unused slot for
   2197    //     alternation (OR) state save, but applying the fixup to it does no harm.)
   2198    for (;;) {
   2199        patIdx = fParenStack.popi();
   2200        if (patIdx < 0) {
   2201            // value < 0 flags the start of the frame on the paren stack.
   2202            break;
   2203        }
   2204        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
   2205        patOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(patIdx));
   2206        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
   2207        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
   2208        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
   2209        fMatchOpenParen     = patIdx;
   2210    }
   2211 
   2212    //  At the close of any parenthesized block, restore the match mode flags  to
   2213    //  the value they had at the open paren.  Saved value is
   2214    //  at the top of the paren stack.
   2215    fModeFlags = fParenStack.popi();
   2216    U_ASSERT(fModeFlags < 0);
   2217 
   2218    // DO any additional fixups, depending on the specific kind of
   2219    // parentesized grouping this is
   2220 
   2221    switch (patIdx) {
   2222    case plain:
   2223    case flags:
   2224        // No additional fixups required.
   2225        //   (Grouping-only parentheses)
   2226        break;
   2227    case capturing:
   2228        // Capturing Parentheses.
   2229        //   Insert a End Capture op into the pattern.
   2230        //   The frame offset of the variables for this cg is obtained from the
   2231        //       start capture op and put it into the end-capture op.
   2232        {
   2233            int32_t captureOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1));
   2234            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
   2235 
   2236            int32_t   frameVarLocation = URX_VAL(captureOp);
   2237            appendOp(URX_END_CAPTURE, frameVarLocation);
   2238        }
   2239        break;
   2240    case atomic:
   2241        // Atomic Parenthesis.
   2242        //   Insert a LD_SP operation to restore the state stack to the position
   2243        //   it was when the atomic parens were entered.
   2244        {
   2245            int32_t stoOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1));
   2246            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
   2247            int32_t   stoLoc = URX_VAL(stoOp);
   2248            appendOp(URX_LD_SP, stoLoc);
   2249        }
   2250        break;
   2251 
   2252    case lookAhead:
   2253        {
   2254            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5));
   2255            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2256            int32_t dataLoc  = URX_VAL(startOp);
   2257            appendOp(URX_LA_END, dataLoc);
   2258        }
   2259        break;
   2260 
   2261    case negLookAhead:
   2262        {
   2263            // See comment at doOpenLookAheadNeg
   2264            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 1));
   2265            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
   2266            int32_t dataLoc  = URX_VAL(startOp);
   2267            appendOp(URX_LA_END, dataLoc);
   2268            appendOp(URX_BACKTRACK, 0);
   2269            appendOp(URX_LA_END, dataLoc);
   2270 
   2271            // Patch the URX_SAVE near the top of the block.
   2272            // The destination of the SAVE is the final LA_END that was just added.
   2273            int32_t saveOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen));
   2274            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
   2275            int32_t dest     = fRXPat->fCompiledPat->size()-1;
   2276            saveOp           = buildOp(URX_STATE_SAVE, dest);
   2277            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
   2278        }
   2279        break;
   2280 
   2281    case lookBehind:
   2282        {
   2283            // See comment at doOpenLookBehind.
   2284 
   2285            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
   2286            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 4));
   2287            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2288            int32_t dataLoc  = URX_VAL(startOp);
   2289            appendOp(URX_LB_END, dataLoc);
   2290            appendOp(URX_LA_END, dataLoc);
   2291 
   2292            // Determine the min and max bounds for the length of the
   2293            //  string that the pattern can match.
   2294            //  An unbounded upper limit is an error.
   2295            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2296            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2297            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2298            if (URX_TYPE(maxML) != 0) {
   2299                error(U_REGEX_LOOK_BEHIND_LIMIT);
   2300                break;
   2301            }
   2302            if (maxML == INT32_MAX) {
   2303                error(U_REGEX_LOOK_BEHIND_LIMIT);
   2304                break;
   2305            }
   2306            if (minML == INT32_MAX) {
   2307                // This condition happens when no match is possible, such as with a
   2308                // [set] expression containing no elements.
   2309                // In principle, the generated code to evaluate the expression could be deleted,
   2310                // but it's probably not worth the complication.
   2311                minML = 0;
   2312            }
   2313            U_ASSERT(minML <= maxML);
   2314 
   2315            // Insert the min and max match len bounds into the URX_LB_CONT op that
   2316            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2317            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
   2318            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
   2319 
   2320        }
   2321        break;
   2322 
   2323 
   2324 
   2325    case lookBehindN:
   2326        {
   2327            // See comment at doOpenLookBehindNeg.
   2328 
   2329            // Append the URX_LBN_END to the compiled pattern.
   2330            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5));
   2331            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
   2332            int32_t dataLoc  = URX_VAL(startOp);
   2333            appendOp(URX_LBN_END, dataLoc);
   2334 
   2335            // Determine the min and max bounds for the length of the
   2336            //  string that the pattern can match.
   2337            //  An unbounded upper limit is an error.
   2338            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
   2339            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
   2340            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
   2341            if (URX_TYPE(maxML) != 0) {
   2342                error(U_REGEX_LOOK_BEHIND_LIMIT);
   2343                break;
   2344            }
   2345            if (maxML == INT32_MAX) {
   2346                error(U_REGEX_LOOK_BEHIND_LIMIT);
   2347                break;
   2348            }
   2349            if (minML == INT32_MAX) {
   2350                // This condition happens when no match is possible, such as with a
   2351                // [set] expression containing no elements.
   2352                // In principle, the generated code to evaluate the expression could be deleted,
   2353                // but it's probably not worth the complication.
   2354                minML = 0;
   2355            }
   2356 
   2357            U_ASSERT(minML <= maxML);
   2358 
   2359            // Insert the min and max match len bounds into the URX_LB_CONT op that
   2360            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
   2361            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
   2362            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
   2363 
   2364            // Insert the pattern location to continue at after a successful match
   2365            //  as the last operand of the URX_LBN_CONT
   2366            int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
   2367            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
   2368        }
   2369        break;
   2370 
   2371 
   2372 
   2373    default:
   2374        UPRV_UNREACHABLE_EXIT;
   2375    }
   2376 
   2377    // remember the next location in the compiled pattern.
   2378    // The compilation of Quantifiers will look at this to see whether its looping
   2379    //   over a parenthesized block or a single item
   2380    fMatchCloseParen = fRXPat->fCompiledPat->size();
   2381 }
   2382 
   2383 
   2384 
   2385 //------------------------------------------------------------------------------
   2386 //
   2387 //   compileSet       Compile the pattern operations for a reference to a
   2388 //                    UnicodeSet.
   2389 //
   2390 //------------------------------------------------------------------------------
   2391 void        RegexCompile::compileSet(UnicodeSet *theSet)
   2392 {
   2393    if (theSet == nullptr) {
   2394        return;
   2395    }
   2396    //  Remove any strings from the set.
   2397    //  There shouldn't be any, but just in case.
   2398    //     (Case Closure can add them; if we had a simple case closure available that
   2399    //      ignored strings, that would be better.)
   2400    theSet->removeAllStrings();
   2401    int32_t  setSize = theSet->size();
   2402 
   2403    switch (setSize) {
   2404    case 0:
   2405        {
   2406            // Set of no elements.   Always fails to match.
   2407            appendOp(URX_BACKTRACK, 0);
   2408            delete theSet;
   2409        }
   2410        break;
   2411 
   2412    case 1:
   2413        {
   2414            // The set contains only a single code point.  Put it into
   2415            //   the compiled pattern as a single char operation rather
   2416            //   than a set, and discard the set itself.
   2417            literalChar(theSet->charAt(0));
   2418            delete theSet;
   2419        }
   2420        break;
   2421 
   2422    default:
   2423        {
   2424            //  The set contains two or more chars.  (the normal case)
   2425            //  Put it into the compiled pattern as a set.
   2426            theSet->freeze();
   2427            int32_t setNumber = fRXPat->fSets->size();
   2428            fRXPat->fSets->addElement(theSet, *fStatus);
   2429            if (U_SUCCESS(*fStatus)) {
   2430                appendOp(URX_SETREF, setNumber);
   2431            } else {
   2432                delete theSet;
   2433            }
   2434        }
   2435    }
   2436 }
   2437 
   2438 
   2439 //------------------------------------------------------------------------------
   2440 //
   2441 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
   2442 //                      Except for the specific opcodes used, the code is the same
   2443 //                      for all three types (greedy, non-greedy, possessive) of
   2444 //                      intervals.  The opcodes are supplied as parameters.
   2445 //                      (There are two sets of opcodes - greedy & possessive use the
   2446 //                      same ones, while non-greedy has it's own.)
   2447 //
   2448 //                      The code for interval loops has this form:
   2449 //                         0  CTR_INIT   counter loc (in stack frame)
   2450 //                         1             5  patt address of CTR_LOOP at bottom of block
   2451 //                         2             min count
   2452 //                         3             max count   (-1 for unbounded)
   2453 //                         4  ...        block to be iterated over
   2454 //                         5  CTR_LOOP
   2455 //
   2456 //                       In
   2457 //------------------------------------------------------------------------------
   2458 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
   2459 {
   2460    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
   2461    //   four slots in the compiled code.  Reserve them.
   2462    int32_t   topOfBlock = blockTopLoc(true);
   2463    insertOp(topOfBlock);
   2464    insertOp(topOfBlock);
   2465    insertOp(topOfBlock);
   2466 
   2467    // The operands for the CTR_INIT opcode include the index in the matcher data
   2468    //   of the counter.  Allocate it now. There are two data items
   2469    //        counterLoc   -->  Loop counter
   2470    //               +1    -->  Input index (for breaking non-progressing loops)
   2471    //                          (Only present if unbounded upper limit on loop)
   2472    int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
   2473    int32_t   counterLoc = allocateStackData(dataSize);
   2474 
   2475    int32_t   op = buildOp(InitOp, counterLoc);
   2476    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
   2477 
   2478    // The second operand of CTR_INIT is the location following the end of the loop.
   2479    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
   2480    //   compilation of something later on causes the code to grow and the target
   2481    //   position to move.
   2482    int32_t loopEnd = fRXPat->fCompiledPat->size();
   2483    op = buildOp(URX_RELOC_OPRND, loopEnd);
   2484    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
   2485 
   2486    // Followed by the min and max counts.
   2487    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
   2488    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
   2489 
   2490    // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
   2491    //   Goes at end of the block being looped over, so just append to the code so far.
   2492    appendOp(LoopOp, topOfBlock);
   2493 
   2494    if ((fIntervalLow & 0xff000000) != 0 ||
   2495        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
   2496            error(U_REGEX_NUMBER_TOO_BIG);
   2497        }
   2498 
   2499    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
   2500        error(U_REGEX_MAX_LT_MIN);
   2501    }
   2502 }
   2503 
   2504 
   2505 
   2506 UBool RegexCompile::compileInlineInterval() {
   2507    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
   2508        // Too big to inline.  Fail, which will cause looping code to be generated.
   2509        //   (Upper < Lower picks up unbounded upper and errors, both.)
   2510        return false;
   2511    }
   2512 
   2513    int32_t   topOfBlock = blockTopLoc(false);
   2514    if (fIntervalUpper == 0) {
   2515        // Pathological case.  Attempt no matches, as if the block doesn't exist.
   2516        // Discard the generated code for the block.
   2517        // If the block included parens, discard the info pertaining to them as well.
   2518        fRXPat->fCompiledPat->setSize(topOfBlock);
   2519        if (fMatchOpenParen >= topOfBlock) {
   2520            fMatchOpenParen = -1;
   2521        }
   2522        if (fMatchCloseParen >= topOfBlock) {
   2523            fMatchCloseParen = -1;
   2524        }
   2525        return true;
   2526    }
   2527 
   2528    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
   2529        // The thing being repeated is not a single op, but some
   2530        //   more complex block.  Do it as a loop, not inlines.
   2531        //   Note that things "repeated" a max of once are handled as inline, because
   2532        //     the one copy of the code already generated is just fine.
   2533        return false;
   2534    }
   2535 
   2536    // Pick up the opcode that is to be repeated
   2537    //
   2538    int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topOfBlock));
   2539 
   2540    // Compute the pattern location where the inline sequence
   2541    //   will end, and set up the state save op that will be needed.
   2542    //
   2543    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
   2544                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
   2545    int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
   2546    if (fIntervalLow == 0) {
   2547        insertOp(topOfBlock);
   2548        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
   2549    }
   2550 
   2551 
   2552 
   2553    //  Loop, emitting the op for the thing being repeated each time.
   2554    //    Loop starts at 1 because one instance of the op already exists in the pattern,
   2555    //    it was put there when it was originally encountered.
   2556    int32_t i;
   2557    for (i=1; i<fIntervalUpper; i++ ) {
   2558        if (i >= fIntervalLow) {
   2559            appendOp(saveOp);
   2560        }
   2561        appendOp(op);
   2562    }
   2563    return true;
   2564 }
   2565 
   2566 
   2567 
   2568 //------------------------------------------------------------------------------
   2569 //
   2570 //   caseInsensitiveStart  given a single code point from a pattern string, determine the 
   2571 //                         set of characters that could potentially begin a case-insensitive 
   2572 //                         match of a string beginning with that character, using full Unicode
   2573 //                         case insensitive matching.
   2574 //
   2575 //          This is used in optimizing find().
   2576 //
   2577 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
   2578 //          misses cases like this:
   2579 //             A string from the pattern begins with 'ss' (although all we know
   2580 //                 in this context is that it begins with 's')
   2581 //             The pattern could match a string beginning with a German sharp-s
   2582 //
   2583 //           To the ordinary case closure for a character c, we add all other
   2584 //           characters cx where the case closure of cx includes a string form that begins
   2585 //           with the original character c.
   2586 //
   2587 //           This function could be made smarter. The full pattern string is available
   2588 //           and it would be possible to verify that the extra characters being added
   2589 //           to the starting set fully match, rather than having just a first-char of the
   2590 //           folded form match.
   2591 //
   2592 //------------------------------------------------------------------------------
   2593 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
   2594 
   2595 // Machine Generated below.
   2596 // It may need updating with new versions of Unicode.
   2597 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
   2598 // The update tool is here:
   2599 // https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
   2600 
   2601 // Machine Generated Data. Do not hand edit.
   2602    static const UChar32 RECaseFixCodePoints[] = {
   2603        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 
   2604        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 
   2605        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 
   2606        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 
   2607        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
   2608 
   2609    static const int16_t RECaseFixStringOffsets[] = {
   2610        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 
   2611        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 
   2612        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 
   2613        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 
   2614        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
   2615 
   2616    static const int16_t RECaseFixCounts[] = {
   2617        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 
   2618        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 
   2619        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
   2620        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
   2621        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
   2622 
   2623    static const char16_t RECaseFixData[] = {
   2624        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 
   2625        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 
   2626        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 
   2627        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 
   2628        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 
   2629        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 
   2630        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 
   2631        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 
   2632        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 
   2633        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 
   2634        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
   2635 
   2636 // End of machine generated data.
   2637 
   2638    if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
   2639        // This function should never be called with an invalid input character.
   2640        UPRV_UNREACHABLE_EXIT;
   2641    } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2642        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
   2643        starterChars->set(caseFoldedC, caseFoldedC);
   2644 
   2645        int32_t i;
   2646        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
   2647            // Simple linear search through the sorted list of interesting code points.
   2648        }
   2649 
   2650        if (RECaseFixCodePoints[i] == c) {
   2651            int32_t dataIndex = RECaseFixStringOffsets[i];
   2652            int32_t numCharsToAdd = RECaseFixCounts[i];
   2653            UChar32 cpToAdd = 0;
   2654            for (int32_t j=0; j<numCharsToAdd; j++) {
   2655                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
   2656                starterChars->add(cpToAdd);
   2657            }
   2658        }
   2659 
   2660        starterChars->closeOver(USET_CASE_INSENSITIVE);
   2661        starterChars->removeAllStrings();
   2662    } else {
   2663        // Not a cased character. Just return it alone.
   2664        starterChars->set(c, c);
   2665    }
   2666 }
   2667 
   2668 
   2669 // Increment with overflow check.
   2670 // val and delta will both be positive.
   2671 
   2672 static int32_t safeIncrement(int32_t val, int32_t delta) {
   2673    if (INT32_MAX - val > delta) {
   2674        return val + delta;
   2675    } else {
   2676        return INT32_MAX;
   2677    }
   2678 }
   2679 
   2680 
   2681 //------------------------------------------------------------------------------
   2682 //
   2683 //   matchStartType    Determine how a match can start.
   2684 //                     Used to optimize find() operations.
   2685 //
   2686 //                     Operation is very similar to minMatchLength().  Walk the compiled
   2687 //                     pattern, keeping an on-going minimum-match-length.  For any
   2688 //                     op where the min match coming in is zero, add that ops possible
   2689 //                     starting matches to the possible starts for the overall pattern.
   2690 //
   2691 //------------------------------------------------------------------------------
   2692 void   RegexCompile::matchStartType() {
   2693    if (U_FAILURE(*fStatus)) {
   2694        return;
   2695    }
   2696 
   2697 
   2698    int32_t    loc;                    // Location in the pattern of the current op being processed.
   2699    int32_t    op;                     // The op being processed
   2700    int32_t    opType;                 // The opcode type of the op
   2701    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
   2702    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
   2703 
   2704    UBool      atStart = true;         // True if no part of the pattern yet encountered
   2705                                       //   could have advanced the position in a match.
   2706                                       //   (Maximum match length so far == 0)
   2707 
   2708    // forwardedLength is a vector holding minimum-match-length values that
   2709    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   2710    //   It must be one longer than the pattern being checked because some  ops
   2711    //   will jmp to a end-of-block+1 location from within a block, and we must
   2712    //   count those when checking the block.
   2713    int32_t end = fRXPat->fCompiledPat->size();
   2714    UVector32  forwardedLength(end+1, *fStatus);
   2715    forwardedLength.setSize(end+1);
   2716    for (loc=3; loc<end; loc++) {
   2717        forwardedLength.setElementAt(INT32_MAX, loc);
   2718    }
   2719 
   2720    for (loc = 3; loc<end; loc++) {
   2721        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   2722        opType = URX_TYPE(op);
   2723 
   2724        // The loop is advancing linearly through the pattern.
   2725        // If the op we are now at was the destination of a branch in the pattern,
   2726        // and that path has a shorter minimum length than the current accumulated value,
   2727        // replace the current accumulated value.
   2728        if (forwardedLength.elementAti(loc) < currentLen) {
   2729            currentLen = forwardedLength.elementAti(loc);
   2730            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   2731        }
   2732 
   2733        switch (opType) {
   2734            // Ops that don't change the total length matched
   2735        case URX_RESERVED_OP:
   2736        case URX_END:
   2737        case URX_FAIL:
   2738        case URX_STRING_LEN:
   2739        case URX_NOP:
   2740        case URX_START_CAPTURE:
   2741        case URX_END_CAPTURE:
   2742        case URX_BACKSLASH_B:
   2743        case URX_BACKSLASH_BU:
   2744        case URX_BACKSLASH_G:
   2745        case URX_BACKSLASH_Z:
   2746        case URX_DOLLAR:
   2747        case URX_DOLLAR_M:
   2748        case URX_DOLLAR_D:
   2749        case URX_DOLLAR_MD:
   2750        case URX_RELOC_OPRND:
   2751        case URX_STO_INP_LOC:
   2752        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   2753        case URX_BACKREF_I:
   2754 
   2755        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   2756        case URX_LD_SP:
   2757            break;
   2758 
   2759        case URX_CARET:
   2760            if (atStart) {
   2761                fRXPat->fStartType = START_START;
   2762            }
   2763            break;
   2764 
   2765        case URX_CARET_M:
   2766        case URX_CARET_M_UNIX:
   2767            if (atStart) {
   2768                fRXPat->fStartType = START_LINE;
   2769            }
   2770            break;
   2771 
   2772        case URX_ONECHAR:
   2773            if (currentLen == 0) {
   2774                // This character could appear at the start of a match.
   2775                //   Add it to the set of possible starting characters.
   2776                fRXPat->fInitialChars->add(URX_VAL(op));
   2777                numInitialStrings += 2;
   2778            }
   2779            currentLen = safeIncrement(currentLen, 1);
   2780            atStart = false;
   2781            break;
   2782 
   2783 
   2784        case URX_SETREF:
   2785            if (currentLen == 0) {
   2786                int32_t  sn = URX_VAL(op);
   2787                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2788                const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn));
   2789                fRXPat->fInitialChars->addAll(*s);
   2790                numInitialStrings += 2;
   2791            }
   2792            currentLen = safeIncrement(currentLen, 1);
   2793            atStart = false;
   2794            break;
   2795 
   2796        case URX_LOOP_SR_I:
   2797            // [Set]*, like a SETREF, above, in what it can match,
   2798            //  but may not match at all, so currentLen is not incremented.
   2799            if (currentLen == 0) {
   2800                int32_t  sn = URX_VAL(op);
   2801                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
   2802                const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn));
   2803                fRXPat->fInitialChars->addAll(*s);
   2804                numInitialStrings += 2;
   2805            }
   2806            atStart = false;
   2807            break;
   2808 
   2809        case URX_LOOP_DOT_I:
   2810            if (currentLen == 0) {
   2811                // .* at the start of a pattern.
   2812                //    Any character can begin the match.
   2813                fRXPat->fInitialChars->clear();
   2814                fRXPat->fInitialChars->complement();
   2815                numInitialStrings += 2;
   2816            }
   2817            atStart = false;
   2818            break;
   2819 
   2820 
   2821        case URX_STATIC_SETREF:
   2822            if (currentLen == 0) {
   2823                int32_t  sn = URX_VAL(op);
   2824                U_ASSERT(sn>0 && sn<URX_LAST_SET);
   2825                const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
   2826                fRXPat->fInitialChars->addAll(s);
   2827                numInitialStrings += 2;
   2828            }
   2829            currentLen = safeIncrement(currentLen, 1);
   2830            atStart = false;
   2831            break;
   2832 
   2833 
   2834 
   2835        case URX_STAT_SETREF_N:
   2836            if (currentLen == 0) {
   2837                int32_t  sn = URX_VAL(op);
   2838                UnicodeSet sc;
   2839                sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
   2840                fRXPat->fInitialChars->addAll(sc);
   2841                numInitialStrings += 2;
   2842            }
   2843            currentLen = safeIncrement(currentLen, 1);
   2844            atStart = false;
   2845            break;
   2846 
   2847 
   2848 
   2849        case URX_BACKSLASH_D:
   2850            // Digit Char
   2851             if (currentLen == 0) {
   2852                 UnicodeSet s;
   2853                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
   2854                 if (URX_VAL(op) != 0) {
   2855                     s.complement();
   2856                 }
   2857                 fRXPat->fInitialChars->addAll(s);
   2858                 numInitialStrings += 2;
   2859            }
   2860            currentLen = safeIncrement(currentLen, 1);
   2861            atStart = false;
   2862            break;
   2863 
   2864 
   2865        case URX_BACKSLASH_H:
   2866            // Horiz white space
   2867            if (currentLen == 0) {
   2868                UnicodeSet s;
   2869                s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
   2870                s.add(static_cast<UChar32>(9)); // Tab
   2871                if (URX_VAL(op) != 0) {
   2872                    s.complement();
   2873                }
   2874                fRXPat->fInitialChars->addAll(s);
   2875                numInitialStrings += 2;
   2876            }
   2877            currentLen = safeIncrement(currentLen, 1);
   2878            atStart = false;
   2879            break;
   2880 
   2881 
   2882        case URX_BACKSLASH_R:       // Any line ending sequence
   2883        case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
   2884            if (currentLen == 0) {
   2885                UnicodeSet s;
   2886                s.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
   2887                s.add(static_cast<UChar32>(0x85));
   2888                s.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
   2889                if (URX_VAL(op) != 0) {
   2890                     // Complement option applies to URX_BACKSLASH_V only.
   2891                     s.complement();
   2892                }
   2893                fRXPat->fInitialChars->addAll(s);
   2894                numInitialStrings += 2;
   2895            }
   2896            currentLen = safeIncrement(currentLen, 1);
   2897            atStart = false;
   2898            break;
   2899 
   2900 
   2901 
   2902        case URX_ONECHAR_I:
   2903            // Case Insensitive Single Character.
   2904            if (currentLen == 0) {
   2905                UChar32  c = URX_VAL(op);
   2906                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
   2907                    UnicodeSet starters(c, c);
   2908                    starters.closeOver(USET_CASE_INSENSITIVE);
   2909                    // findCaseInsensitiveStarters(c, &starters);
   2910                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
   2911                    //   The expanded folding can't match the pattern.
   2912                    fRXPat->fInitialChars->addAll(starters);
   2913                } else {
   2914                    // Char has no case variants.  Just add it as-is to the
   2915                    //   set of possible starting chars.
   2916                    fRXPat->fInitialChars->add(c);
   2917                }
   2918                numInitialStrings += 2;
   2919            }
   2920            currentLen = safeIncrement(currentLen, 1);
   2921            atStart = false;
   2922            break;
   2923 
   2924 
   2925        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
   2926        case URX_DOTANY_ALL:    // . matches one or two.
   2927        case URX_DOTANY:
   2928        case URX_DOTANY_UNIX:
   2929            if (currentLen == 0) {
   2930                // These constructs are all bad news when they appear at the start
   2931                //   of a match.  Any character can begin the match.
   2932                fRXPat->fInitialChars->clear();
   2933                fRXPat->fInitialChars->complement();
   2934                numInitialStrings += 2;
   2935            }
   2936            currentLen = safeIncrement(currentLen, 1);
   2937            atStart = false;
   2938            break;
   2939 
   2940 
   2941        case URX_JMPX:
   2942            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
   2943            U_FALLTHROUGH;
   2944        case URX_JMP:
   2945            {
   2946                int32_t  jmpDest = URX_VAL(op);
   2947                if (jmpDest < loc) {
   2948                    // Loop of some kind.  Can safely ignore, the worst that will happen
   2949                    //  is that we understate the true minimum length
   2950                    currentLen = forwardedLength.elementAti(loc+1);
   2951 
   2952                } else {
   2953                    // Forward jump.  Propagate the current min length to the target loc of the jump.
   2954                    U_ASSERT(jmpDest <= end+1);
   2955                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
   2956                        forwardedLength.setElementAt(currentLen, jmpDest);
   2957                    }
   2958                }
   2959            }
   2960            atStart = false;
   2961            break;
   2962 
   2963        case URX_JMP_SAV:
   2964        case URX_JMP_SAV_X:
   2965            // Combo of state save to the next loc, + jmp backwards.
   2966            //   Net effect on min. length computation is nothing.
   2967            atStart = false;
   2968            break;
   2969 
   2970        case URX_BACKTRACK:
   2971            // Fails are kind of like a branch, except that the min length was
   2972            //   propagated already, by the state save.
   2973            currentLen = forwardedLength.elementAti(loc+1);
   2974            atStart = false;
   2975            break;
   2976 
   2977 
   2978        case URX_STATE_SAVE:
   2979            {
   2980                // State Save, for forward jumps, propagate the current minimum.
   2981                //             of the state save.
   2982                int32_t  jmpDest = URX_VAL(op);
   2983                if (jmpDest > loc) {
   2984                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
   2985                        forwardedLength.setElementAt(currentLen, jmpDest);
   2986                    }
   2987                }
   2988            }
   2989            atStart = false;
   2990            break;
   2991 
   2992 
   2993 
   2994 
   2995        case URX_STRING:
   2996            {
   2997                loc++;
   2998                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   2999                int32_t stringLen   = URX_VAL(stringLenOp);
   3000                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   3001                U_ASSERT(stringLenOp >= 2);
   3002                if (currentLen == 0) {
   3003                    // Add the starting character of this string to the set of possible starting
   3004                    //   characters for this pattern.
   3005                    int32_t stringStartIdx = URX_VAL(op);
   3006                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   3007                    fRXPat->fInitialChars->add(c);
   3008 
   3009                    // Remember this string.  After the entire pattern has been checked,
   3010                    //  if nothing else is identified that can start a match, we'll use it.
   3011                    numInitialStrings++;
   3012                    fRXPat->fInitialStringIdx = stringStartIdx;
   3013                    fRXPat->fInitialStringLen = stringLen;
   3014                }
   3015 
   3016                currentLen = safeIncrement(currentLen, stringLen);
   3017                atStart = false;
   3018            }
   3019            break;
   3020 
   3021        case URX_STRING_I:
   3022            {
   3023                // Case-insensitive string.  Unlike exact-match strings, we won't
   3024                //   attempt a string search for possible match positions.  But we
   3025                //   do update the set of possible starting characters.
   3026                loc++;
   3027                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3028                int32_t stringLen   = URX_VAL(stringLenOp);
   3029                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
   3030                U_ASSERT(stringLenOp >= 2);
   3031                if (currentLen == 0) {
   3032                    // Add the starting character of this string to the set of possible starting
   3033                    //   characters for this pattern.
   3034                    int32_t stringStartIdx = URX_VAL(op);
   3035                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
   3036                    UnicodeSet s;
   3037                    findCaseInsensitiveStarters(c, &s);
   3038                    fRXPat->fInitialChars->addAll(s);
   3039                    numInitialStrings += 2;  // Matching on an initial string not possible.
   3040                }
   3041                currentLen = safeIncrement(currentLen, stringLen);
   3042                atStart = false;
   3043            }
   3044            break;
   3045 
   3046        case URX_CTR_INIT:
   3047        case URX_CTR_INIT_NG:
   3048            {
   3049                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
   3050                //   so location must be updated accordingly.
   3051                // Loop Init Ops.
   3052                //   If the min loop count == 0
   3053                //      move loc forwards to the end of the loop, skipping over the body.
   3054                //   If the min count is > 0,
   3055                //      continue normal processing of the body of the loop.
   3056                int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1));
   3057                        loopEndLoc   = URX_VAL(loopEndLoc);
   3058                int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2));
   3059                if (minLoopCount == 0) {
   3060                    // Min Loop Count of 0, treat like a forward branch and
   3061                    //   move the current minimum length up to the target
   3062                    //   (end of loop) location.
   3063                    U_ASSERT(loopEndLoc <= end+1);
   3064                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
   3065                        forwardedLength.setElementAt(currentLen, loopEndLoc);
   3066                    }
   3067                }
   3068                loc+=3;  // Skips over operands of CTR_INIT
   3069            }
   3070            atStart = false;
   3071            break;
   3072 
   3073 
   3074        case URX_CTR_LOOP:
   3075        case URX_CTR_LOOP_NG:
   3076            // Loop ops.
   3077            //  The jump is conditional, backwards only.
   3078            atStart = false;
   3079            break;
   3080 
   3081        case URX_LOOP_C:
   3082            // More loop ops.  These state-save to themselves.
   3083            //   don't change the minimum match
   3084            atStart = false;
   3085            break;
   3086 
   3087 
   3088        case URX_LA_START:
   3089        case URX_LB_START:
   3090            {
   3091                // Look-around.  Scan forward until the matching look-ahead end,
   3092                //   without processing the look-around block.  This is overly pessimistic.
   3093 
   3094                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
   3095                //   lookahead contains two LA_END instructions, so count goes up by two
   3096                //   for each LA_START.
   3097                int32_t  depth = (opType == URX_LA_START? 2: 1);
   3098                for (;;) {
   3099                    loc++;
   3100                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3101                    if (URX_TYPE(op) == URX_LA_START) {
   3102                        depth+=2;
   3103                    }
   3104                    if (URX_TYPE(op) == URX_LB_START) {
   3105                        depth++;
   3106                    }
   3107                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
   3108                        depth--;
   3109                        if (depth == 0) {
   3110                            break;
   3111                        }
   3112                    }
   3113                    if (URX_TYPE(op) == URX_STATE_SAVE) {
   3114                        // Need this because neg lookahead blocks will FAIL to outside
   3115                        //   of the block.
   3116                        int32_t  jmpDest = URX_VAL(op);
   3117                        if (jmpDest > loc) {
   3118                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3119                                forwardedLength.setElementAt(currentLen, jmpDest);
   3120                            }
   3121                        }
   3122                    }
   3123                    U_ASSERT(loc <= end);
   3124                }
   3125            }
   3126            break;
   3127 
   3128        case URX_LA_END:
   3129        case URX_LB_CONT:
   3130        case URX_LB_END:
   3131        case URX_LBN_CONT:
   3132        case URX_LBN_END:
   3133            UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
   3134                                    //  consumed by the scan in URX_LA_START and LB_START
   3135        default:
   3136            UPRV_UNREACHABLE_EXIT;
   3137            }
   3138 
   3139        }
   3140 
   3141 
   3142    // We have finished walking through the ops.  Check whether some forward jump
   3143    //   propagated a shorter length to location end+1.
   3144    if (forwardedLength.elementAti(end+1) < currentLen) {
   3145        currentLen = forwardedLength.elementAti(end+1);
   3146    }
   3147 
   3148 
   3149    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
   3150 
   3151 
   3152    // Sort out what we should check for when looking for candidate match start positions.
   3153    // In order of preference,
   3154    //     1.   Start of input text buffer.
   3155    //     2.   A literal string.
   3156    //     3.   Start of line in multi-line mode.
   3157    //     4.   A single literal character.
   3158    //     5.   A character from a set of characters.
   3159    //
   3160    if (fRXPat->fStartType == START_START) {
   3161        // Match only at the start of an input text string.
   3162        //    start type is already set.  We're done.
   3163    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
   3164        // Match beginning only with a literal string.
   3165        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
   3166        U_ASSERT(fRXPat->fInitialChars->contains(c));
   3167        fRXPat->fStartType   = START_STRING;
   3168        fRXPat->fInitialChar = c;
   3169    } else if (fRXPat->fStartType == START_LINE) {
   3170        // Match at start of line in Multi-Line mode.
   3171        // Nothing to do here; everything is already set.
   3172    } else if (fRXPat->fMinMatchLen == 0) {
   3173        // Zero length match possible.  We could start anywhere.
   3174        fRXPat->fStartType = START_NO_INFO;
   3175    } else if (fRXPat->fInitialChars->size() == 1) {
   3176        // All matches begin with the same char.
   3177        fRXPat->fStartType   = START_CHAR;
   3178        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
   3179        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
   3180    } else if (fRXPat->fInitialChars->contains(static_cast<UChar32>(0), static_cast<UChar32>(0x10ffff)) == false &&
   3181        fRXPat->fMinMatchLen > 0) {
   3182        // Matches start with a set of character smaller than the set of all chars.
   3183        fRXPat->fStartType = START_SET;
   3184    } else {
   3185        // Matches can start with anything
   3186        fRXPat->fStartType = START_NO_INFO;
   3187    }
   3188 }
   3189 
   3190 
   3191 
   3192 //------------------------------------------------------------------------------
   3193 //
   3194 //   minMatchLength    Calculate the length of the shortest string that could
   3195 //                     match the specified pattern.
   3196 //                     Length is in 16 bit code units, not code points.
   3197 //
   3198 //                     The calculated length may not be exact.  The returned
   3199 //                     value may be shorter than the actual minimum; it must
   3200 //                     never be longer.
   3201 //
   3202 //                     start and end are the range of p-code operations to be
   3203 //                     examined.  The endpoints are included in the range.
   3204 //
   3205 //------------------------------------------------------------------------------
   3206 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
   3207    if (U_FAILURE(*fStatus)) {
   3208        return 0;
   3209    }
   3210 
   3211    U_ASSERT(start <= end);
   3212    U_ASSERT(end < fRXPat->fCompiledPat->size());
   3213 
   3214 
   3215    int32_t    loc;
   3216    int32_t    op;
   3217    int32_t    opType;
   3218    int32_t    currentLen = 0;
   3219 
   3220 
   3221    // forwardedLength is a vector holding minimum-match-length values that
   3222    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
   3223    //   It must be one longer than the pattern being checked because some  ops
   3224    //   will jmp to a end-of-block+1 location from within a block, and we must
   3225    //   count those when checking the block.
   3226    UVector32  forwardedLength(end+2, *fStatus);
   3227    forwardedLength.setSize(end+2);
   3228    for (loc=start; loc<=end+1; loc++) {
   3229        forwardedLength.setElementAt(INT32_MAX, loc);
   3230    }
   3231 
   3232    for (loc = start; loc<=end; loc++) {
   3233        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3234        opType = URX_TYPE(op);
   3235 
   3236        // The loop is advancing linearly through the pattern.
   3237        // If the op we are now at was the destination of a branch in the pattern,
   3238        // and that path has a shorter minimum length than the current accumulated value,
   3239        // replace the current accumulated value.
   3240        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
   3241                                                               //   no-match-possible cases.
   3242        if (forwardedLength.elementAti(loc) < currentLen) {
   3243            currentLen = forwardedLength.elementAti(loc);
   3244            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3245        }
   3246 
   3247        switch (opType) {
   3248            // Ops that don't change the total length matched
   3249        case URX_RESERVED_OP:
   3250        case URX_END:
   3251        case URX_STRING_LEN:
   3252        case URX_NOP:
   3253        case URX_START_CAPTURE:
   3254        case URX_END_CAPTURE:
   3255        case URX_BACKSLASH_B:
   3256        case URX_BACKSLASH_BU:
   3257        case URX_BACKSLASH_G:
   3258        case URX_BACKSLASH_Z:
   3259        case URX_CARET:
   3260        case URX_DOLLAR:
   3261        case URX_DOLLAR_M:
   3262        case URX_DOLLAR_D:
   3263        case URX_DOLLAR_MD:
   3264        case URX_RELOC_OPRND:
   3265        case URX_STO_INP_LOC:
   3266        case URX_CARET_M:
   3267        case URX_CARET_M_UNIX:
   3268        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3269        case URX_BACKREF_I:
   3270 
   3271        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3272        case URX_LD_SP:
   3273 
   3274        case URX_JMP_SAV:
   3275        case URX_JMP_SAV_X:
   3276            break;
   3277 
   3278 
   3279            // Ops that match a minimum of one character (one or two 16 bit code units.)
   3280            //
   3281        case URX_ONECHAR:
   3282        case URX_STATIC_SETREF:
   3283        case URX_STAT_SETREF_N:
   3284        case URX_SETREF:
   3285        case URX_BACKSLASH_D:
   3286        case URX_BACKSLASH_H:
   3287        case URX_BACKSLASH_R:
   3288        case URX_BACKSLASH_V:
   3289        case URX_ONECHAR_I:
   3290        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
   3291        case URX_DOTANY_ALL:    // . matches one or two.
   3292        case URX_DOTANY:
   3293        case URX_DOTANY_UNIX:
   3294            currentLen = safeIncrement(currentLen, 1);
   3295            break;
   3296 
   3297 
   3298        case URX_JMPX:
   3299            loc++;              // URX_JMPX has an extra operand, ignored here,
   3300                                //   otherwise processed identically to URX_JMP.
   3301            U_FALLTHROUGH;
   3302        case URX_JMP:
   3303            {
   3304                int32_t  jmpDest = URX_VAL(op);
   3305                if (jmpDest < loc) {
   3306                    // Loop of some kind.  Can safely ignore, the worst that will happen
   3307                    //  is that we understate the true minimum length
   3308                    currentLen = forwardedLength.elementAti(loc+1);
   3309                } else {
   3310                    // Forward jump.  Propagate the current min length to the target loc of the jump.
   3311                    U_ASSERT(jmpDest <= end+1);
   3312                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
   3313                        forwardedLength.setElementAt(currentLen, jmpDest);
   3314                    }
   3315                }
   3316            }
   3317            break;
   3318 
   3319        case URX_BACKTRACK:
   3320            {
   3321                // Back-tracks are kind of like a branch, except that the min length was
   3322                //   propagated already, by the state save.
   3323                currentLen = forwardedLength.elementAti(loc+1);
   3324            }
   3325            break;
   3326 
   3327 
   3328        case URX_STATE_SAVE:
   3329            {
   3330                // State Save, for forward jumps, propagate the current minimum.
   3331                //             of the state save.
   3332                int32_t  jmpDest = URX_VAL(op);
   3333                if (jmpDest > loc) {
   3334                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3335                        forwardedLength.setElementAt(currentLen, jmpDest);
   3336                    }
   3337                }
   3338            }
   3339            break;
   3340 
   3341 
   3342        case URX_STRING:
   3343            {
   3344                loc++;
   3345                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3346                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3347            }
   3348            break;
   3349 
   3350 
   3351        case URX_STRING_I:
   3352            {
   3353                loc++;
   3354                // TODO: with full case folding, matching input text may be shorter than
   3355                //       the string we have here.  More smarts could put some bounds on it.
   3356                //       Assume a min length of one for now.  A min length of zero causes
   3357                //        optimization failures for a pattern like "string"+
   3358                // currentLen += URX_VAL(stringLenOp);
   3359                currentLen = safeIncrement(currentLen, 1);
   3360            }
   3361            break;
   3362 
   3363        case URX_CTR_INIT:
   3364        case URX_CTR_INIT_NG:
   3365            {
   3366                // Loop Init Ops.
   3367                //   If the min loop count == 0
   3368                //      move loc forwards to the end of the loop, skipping over the body.
   3369                //   If the min count is > 0,
   3370                //      continue normal processing of the body of the loop.
   3371                int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1));
   3372                        loopEndLoc   = URX_VAL(loopEndLoc);
   3373                int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2));
   3374                if (minLoopCount == 0) {
   3375                    loc = loopEndLoc;
   3376                } else {
   3377                    loc+=3;  // Skips over operands of CTR_INIT
   3378                }
   3379            }
   3380            break;
   3381 
   3382 
   3383        case URX_CTR_LOOP:
   3384        case URX_CTR_LOOP_NG:
   3385            // Loop ops.
   3386            //  The jump is conditional, backwards only.
   3387            break;
   3388 
   3389        case URX_LOOP_SR_I:
   3390        case URX_LOOP_DOT_I:
   3391        case URX_LOOP_C:
   3392            // More loop ops.  These state-save to themselves.
   3393            //   don't change the minimum match - could match nothing at all.
   3394            break;
   3395 
   3396 
   3397        case URX_LA_START:
   3398        case URX_LB_START:
   3399            {
   3400                // Look-around.  Scan forward until the matching look-ahead end,
   3401                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
   3402                //   it assumes that the look-ahead match might be zero-length.
   3403                //   TODO:  Positive lookahead could recursively do the block, then continue
   3404                //          with the longer of the block or the value coming in.  Ticket 6060
   3405                int32_t  depth = (opType == URX_LA_START? 2: 1);
   3406                for (;;) {
   3407                    loc++;
   3408                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3409                    if (URX_TYPE(op) == URX_LA_START) {
   3410                        // The boilerplate for look-ahead includes two LA_END instructions,
   3411                        //    Depth will be decremented by each one when it is seen.
   3412                        depth += 2;
   3413                    }
   3414                    if (URX_TYPE(op) == URX_LB_START) {
   3415                        depth++;
   3416                    }
   3417                    if (URX_TYPE(op) == URX_LA_END) {
   3418                        depth--;
   3419                        if (depth == 0) {
   3420                            break;
   3421                        }
   3422                    }
   3423                    if (URX_TYPE(op)==URX_LBN_END) {
   3424                        depth--;
   3425                        if (depth == 0) {
   3426                            break;
   3427                        }
   3428                    }
   3429                    if (URX_TYPE(op) == URX_STATE_SAVE) {
   3430                        // Need this because neg lookahead blocks will FAIL to outside
   3431                        //   of the block.
   3432                        int32_t  jmpDest = URX_VAL(op);
   3433                        if (jmpDest > loc) {
   3434                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
   3435                                forwardedLength.setElementAt(currentLen, jmpDest);
   3436                            }
   3437                        }
   3438                    }
   3439                    U_ASSERT(loc <= end);
   3440                }
   3441            }
   3442            break;
   3443 
   3444        case URX_LA_END:
   3445        case URX_LB_CONT:
   3446        case URX_LB_END:
   3447        case URX_LBN_CONT:
   3448        case URX_LBN_END:
   3449            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
   3450            //   range being sized, which happens when measuring size of look-behind blocks.
   3451            break;
   3452 
   3453        default:
   3454            UPRV_UNREACHABLE_EXIT;
   3455            }
   3456 
   3457        }
   3458 
   3459    // We have finished walking through the ops.  Check whether some forward jump
   3460    //   propagated a shorter length to location end+1.
   3461    if (forwardedLength.elementAti(end+1) < currentLen) {
   3462        currentLen = forwardedLength.elementAti(end+1);
   3463        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
   3464    }
   3465 
   3466    return currentLen;
   3467 }
   3468 
   3469 //------------------------------------------------------------------------------
   3470 //
   3471 //   maxMatchLength    Calculate the length of the longest string that could
   3472 //                     match the specified pattern.
   3473 //                     Length is in 16 bit code units, not code points.
   3474 //
   3475 //                     The calculated length may not be exact.  The returned
   3476 //                     value may be longer than the actual maximum; it must
   3477 //                     never be shorter.
   3478 //
   3479 //                     start, end: the range of the pattern to check.
   3480 //                     end is inclusive.
   3481 //
   3482 //------------------------------------------------------------------------------
   3483 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
   3484    if (U_FAILURE(*fStatus)) {
   3485        return 0;
   3486    }
   3487    U_ASSERT(start <= end);
   3488    U_ASSERT(end < fRXPat->fCompiledPat->size());
   3489 
   3490    int32_t    loc;
   3491    int32_t    op;
   3492    int32_t    opType;
   3493    int32_t    currentLen = 0;
   3494    UVector32  forwardedLength(end+1, *fStatus);
   3495    forwardedLength.setSize(end+1);
   3496 
   3497    for (loc=start; loc<=end; loc++) {
   3498        forwardedLength.setElementAt(0, loc);
   3499    }
   3500 
   3501    for (loc = start; loc<=end; loc++) {
   3502        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3503        opType = URX_TYPE(op);
   3504 
   3505        // The loop is advancing linearly through the pattern.
   3506        // If the op we are now at was the destination of a branch in the pattern,
   3507        // and that path has a longer maximum length than the current accumulated value,
   3508        // replace the current accumulated value.
   3509        if (forwardedLength.elementAti(loc) > currentLen) {
   3510            currentLen = forwardedLength.elementAti(loc);
   3511        }
   3512 
   3513        switch (opType) {
   3514            // Ops that don't change the total length matched
   3515        case URX_RESERVED_OP:
   3516        case URX_END:
   3517        case URX_STRING_LEN:
   3518        case URX_NOP:
   3519        case URX_START_CAPTURE:
   3520        case URX_END_CAPTURE:
   3521        case URX_BACKSLASH_B:
   3522        case URX_BACKSLASH_BU:
   3523        case URX_BACKSLASH_G:
   3524        case URX_BACKSLASH_Z:
   3525        case URX_CARET:
   3526        case URX_DOLLAR:
   3527        case URX_DOLLAR_M:
   3528        case URX_DOLLAR_D:
   3529        case URX_DOLLAR_MD:
   3530        case URX_RELOC_OPRND:
   3531        case URX_STO_INP_LOC:
   3532        case URX_CARET_M:
   3533        case URX_CARET_M_UNIX:
   3534 
   3535        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
   3536        case URX_LD_SP:
   3537 
   3538        case URX_LB_END:
   3539        case URX_LB_CONT:
   3540        case URX_LBN_CONT:
   3541        case URX_LBN_END:
   3542            break;
   3543 
   3544 
   3545            // Ops that increase that cause an unbounded increase in the length
   3546            //   of a matched string, or that increase it a hard to characterize way.
   3547            //   Call the max length unbounded, and stop further checking.
   3548        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
   3549        case URX_BACKREF_I:
   3550        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
   3551            currentLen = INT32_MAX;
   3552            break;
   3553 
   3554 
   3555            // Ops that match a max of one character (possibly two 16 bit code units.)
   3556            //
   3557        case URX_STATIC_SETREF:
   3558        case URX_STAT_SETREF_N:
   3559        case URX_SETREF:
   3560        case URX_BACKSLASH_D:
   3561        case URX_BACKSLASH_H:
   3562        case URX_BACKSLASH_R:
   3563        case URX_BACKSLASH_V:
   3564        case URX_ONECHAR_I:
   3565        case URX_DOTANY_ALL:
   3566        case URX_DOTANY:
   3567        case URX_DOTANY_UNIX:
   3568            currentLen = safeIncrement(currentLen, 2);
   3569            break;
   3570 
   3571            // Single literal character.  Increase current max length by one or two,
   3572            //       depending on whether the char is in the supplementary range.
   3573        case URX_ONECHAR:
   3574            currentLen = safeIncrement(currentLen, 1);
   3575            if (URX_VAL(op) > 0x10000) {
   3576                currentLen = safeIncrement(currentLen, 1);
   3577            }
   3578            break;
   3579 
   3580            // Jumps.
   3581            //
   3582        case URX_JMP:
   3583        case URX_JMPX:
   3584        case URX_JMP_SAV:
   3585        case URX_JMP_SAV_X:
   3586            {
   3587                int32_t  jmpDest = URX_VAL(op);
   3588                if (jmpDest < loc) {
   3589                    // Loop of some kind.  Max match length is unbounded.
   3590                    currentLen = INT32_MAX;
   3591                } else {
   3592                    // Forward jump.  Propagate the current min length to the target loc of the jump.
   3593                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
   3594                        forwardedLength.setElementAt(currentLen, jmpDest);
   3595                    }
   3596                    currentLen = 0;
   3597                }
   3598            }
   3599            break;
   3600 
   3601        case URX_BACKTRACK:
   3602            // back-tracks are kind of like a branch, except that the max length was
   3603            //   propagated already, by the state save.
   3604            currentLen = forwardedLength.elementAti(loc+1);
   3605            break;
   3606 
   3607 
   3608        case URX_STATE_SAVE:
   3609            {
   3610                // State Save, for forward jumps, propagate the current minimum.
   3611                //               of the state save.
   3612                //             For backwards jumps, they create a loop, maximum
   3613                //               match length is unbounded.
   3614                int32_t  jmpDest = URX_VAL(op);
   3615                if (jmpDest > loc) {
   3616                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
   3617                        forwardedLength.setElementAt(currentLen, jmpDest);
   3618                    }
   3619                } else {
   3620                    currentLen = INT32_MAX;
   3621                }
   3622            }
   3623            break;
   3624 
   3625 
   3626 
   3627 
   3628        case URX_STRING:
   3629            {
   3630                loc++;
   3631                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3632                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3633                break;
   3634            }
   3635 
   3636        case URX_STRING_I:
   3637            // TODO:  This code assumes that any user string that matches will be no longer
   3638            //        than our compiled string, with case insensitive matching.
   3639            //        Our compiled string has been case-folded already.
   3640            //
   3641            //        Any matching user string will have no more code points than our
   3642            //        compiled (folded) string.  Folding may add code points, but
   3643            //        not remove them.
   3644            //
   3645            //        There is a potential problem if a supplemental code point
   3646            //        case-folds to a BMP code point.  In this case our compiled string
   3647            //        could be shorter (in code units) than a matching user string.
   3648            //
   3649            //        At this time (Unicode 6.1) there are no such characters, and this case
   3650            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
   3651            //        any problematic characters are added to Unicode.
   3652            //
   3653            //        If this happens, we can make a set of the BMP chars that the
   3654            //        troublesome supplementals fold to, scan our string, and bump the
   3655            //        currentLen one extra for each that is found.
   3656            //
   3657            {
   3658                loc++;
   3659                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3660                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
   3661            }
   3662            break;
   3663 
   3664        case URX_CTR_INIT:
   3665        case URX_CTR_INIT_NG:
   3666            // For Loops, recursively call this function on the pattern for the loop body,
   3667            //   then multiply the result by the maximum loop count.
   3668            {
   3669                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
   3670                if (loopEndLoc == loc+4) {
   3671                    // Loop has an empty body. No affect on max match length.
   3672                    // Continue processing with code after the loop end.
   3673                    loc = loopEndLoc;
   3674                    break;
   3675                }
   3676 
   3677                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
   3678                if (maxLoopCount == -1) {
   3679                    // Unbounded Loop. No upper bound on match length.
   3680                    currentLen = INT32_MAX;
   3681                    break;
   3682                }
   3683 
   3684                U_ASSERT(loopEndLoc >= loc+4);
   3685                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
   3686                int64_t updatedLen = static_cast<int64_t>(currentLen) + blockLen * maxLoopCount;
   3687                if (updatedLen >= INT32_MAX) {
   3688                    currentLen = INT32_MAX;
   3689                    break;
   3690                }
   3691                currentLen = static_cast<int32_t>(updatedLen);
   3692                loc = loopEndLoc;
   3693                break;
   3694            }
   3695 
   3696        case URX_CTR_LOOP:
   3697        case URX_CTR_LOOP_NG:
   3698            // These opcodes will be skipped over by code for URX_CTR_INIT.
   3699            // We shouldn't encounter them here.
   3700            UPRV_UNREACHABLE_EXIT;
   3701 
   3702        case URX_LOOP_SR_I:
   3703        case URX_LOOP_DOT_I:
   3704        case URX_LOOP_C:
   3705            // For anything to do with loops, make the match length unbounded.
   3706            currentLen = INT32_MAX;
   3707            break;
   3708 
   3709 
   3710 
   3711        case URX_LA_START:
   3712        case URX_LA_END:
   3713            // Look-ahead.  Just ignore, treat the look-ahead block as if
   3714            // it were normal pattern.  Gives a too-long match length,
   3715            //  but good enough for now.
   3716            break;
   3717 
   3718            // End of look-ahead ops should always be consumed by the processing at
   3719            //  the URX_LA_START op.
   3720            // UPRV_UNREACHABLE_EXIT;
   3721 
   3722        case URX_LB_START:
   3723            {
   3724                // Look-behind.  Scan forward until the matching look-around end,
   3725                //   without processing the look-behind block.
   3726                int32_t dataLoc = URX_VAL(op);
   3727                for (loc = loc + 1; loc <= end; ++loc) {
   3728                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3729                    int32_t opType = URX_TYPE(op);
   3730                    if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
   3731                        break;
   3732                    }
   3733                }
   3734                U_ASSERT(loc <= end);
   3735            }
   3736            break;
   3737 
   3738        default:
   3739            UPRV_UNREACHABLE_EXIT;
   3740        }
   3741 
   3742 
   3743        if (currentLen == INT32_MAX) {
   3744            //  The maximum length is unbounded.
   3745            //  Stop further processing of the pattern.
   3746            break;
   3747        }
   3748 
   3749    }
   3750    return currentLen;
   3751 
   3752 }
   3753 
   3754 
   3755 //------------------------------------------------------------------------------
   3756 //
   3757 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
   3758 //                Extra NOPs are inserted for some constructs during the initial
   3759 //                code generation to provide locations that may be patched later.
   3760 //                Many end up unneeded, and are removed by this function.
   3761 //
   3762 //                In order to minimize the number of passes through the pattern,
   3763 //                back-reference fixup is also performed here (adjusting
   3764 //                back-reference operands to point to the correct frame offsets).
   3765 //
   3766 //------------------------------------------------------------------------------
   3767 void RegexCompile::stripNOPs() {
   3768 
   3769    if (U_FAILURE(*fStatus)) {
   3770        return;
   3771    }
   3772 
   3773    int32_t    end = fRXPat->fCompiledPat->size();
   3774    UVector32  deltas(end, *fStatus);
   3775 
   3776    // Make a first pass over the code, computing the amount that things
   3777    //   will be offset at each location in the original code.
   3778    int32_t   loc;
   3779    int32_t   d = 0;
   3780    for (loc=0; loc<end; loc++) {
   3781        deltas.addElement(d, *fStatus);
   3782        int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
   3783        if (URX_TYPE(op) == URX_NOP) {
   3784            d++;
   3785        }
   3786    }
   3787 
   3788    UnicodeString caseStringBuffer;
   3789 
   3790    // Make a second pass over the code, removing the NOPs by moving following
   3791    //  code up, and patching operands that refer to code locations that
   3792    //  are being moved.  The array of offsets from the first step is used
   3793    //  to compute the new operand values.
   3794    int32_t src;
   3795    int32_t dst = 0;
   3796    for (src=0; src<end; src++) {
   3797        int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(src));
   3798        int32_t opType = URX_TYPE(op);
   3799        switch (opType) {
   3800        case URX_NOP:
   3801            break;
   3802 
   3803        case URX_STATE_SAVE:
   3804        case URX_JMP:
   3805        case URX_CTR_LOOP:
   3806        case URX_CTR_LOOP_NG:
   3807        case URX_RELOC_OPRND:
   3808        case URX_JMPX:
   3809        case URX_JMP_SAV:
   3810        case URX_JMP_SAV_X:
   3811            // These are instructions with operands that refer to code locations.
   3812            {
   3813                int32_t  operandAddress = URX_VAL(op);
   3814                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
   3815                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
   3816                op = buildOp(opType, fixedOperandAddress);
   3817                fRXPat->fCompiledPat->setElementAt(op, dst);
   3818                dst++;
   3819                break;
   3820            }
   3821 
   3822        case URX_BACKREF:
   3823        case URX_BACKREF_I:
   3824            {
   3825                int32_t where = URX_VAL(op);
   3826                if (where > fRXPat->fGroupMap->size()) {
   3827                    error(U_REGEX_INVALID_BACK_REF);
   3828                    break;
   3829                }
   3830                where = fRXPat->fGroupMap->elementAti(where-1);
   3831                op    = buildOp(opType, where);
   3832                fRXPat->fCompiledPat->setElementAt(op, dst);
   3833                dst++;
   3834 
   3835                fRXPat->fNeedsAltInput = true;
   3836                break;
   3837            }
   3838        case URX_RESERVED_OP:
   3839        case URX_RESERVED_OP_N:
   3840        case URX_BACKTRACK:
   3841        case URX_END:
   3842        case URX_ONECHAR:
   3843        case URX_STRING:
   3844        case URX_STRING_LEN:
   3845        case URX_START_CAPTURE:
   3846        case URX_END_CAPTURE:
   3847        case URX_STATIC_SETREF:
   3848        case URX_STAT_SETREF_N:
   3849        case URX_SETREF:
   3850        case URX_DOTANY:
   3851        case URX_FAIL:
   3852        case URX_BACKSLASH_B:
   3853        case URX_BACKSLASH_BU:
   3854        case URX_BACKSLASH_G:
   3855        case URX_BACKSLASH_X:
   3856        case URX_BACKSLASH_Z:
   3857        case URX_DOTANY_ALL:
   3858        case URX_BACKSLASH_D:
   3859        case URX_CARET:
   3860        case URX_DOLLAR:
   3861        case URX_CTR_INIT:
   3862        case URX_CTR_INIT_NG:
   3863        case URX_DOTANY_UNIX:
   3864        case URX_STO_SP:
   3865        case URX_LD_SP:
   3866        case URX_STO_INP_LOC:
   3867        case URX_LA_START:
   3868        case URX_LA_END:
   3869        case URX_ONECHAR_I:
   3870        case URX_STRING_I:
   3871        case URX_DOLLAR_M:
   3872        case URX_CARET_M:
   3873        case URX_CARET_M_UNIX:
   3874        case URX_LB_START:
   3875        case URX_LB_CONT:
   3876        case URX_LB_END:
   3877        case URX_LBN_CONT:
   3878        case URX_LBN_END:
   3879        case URX_LOOP_SR_I:
   3880        case URX_LOOP_DOT_I:
   3881        case URX_LOOP_C:
   3882        case URX_DOLLAR_D:
   3883        case URX_DOLLAR_MD:
   3884        case URX_BACKSLASH_H:
   3885        case URX_BACKSLASH_R:
   3886        case URX_BACKSLASH_V:
   3887            // These instructions are unaltered by the relocation.
   3888            fRXPat->fCompiledPat->setElementAt(op, dst);
   3889            dst++;
   3890            break;
   3891 
   3892        default:
   3893            // Some op is unaccounted for.
   3894            UPRV_UNREACHABLE_EXIT;
   3895        }
   3896    }
   3897 
   3898    fRXPat->fCompiledPat->setSize(dst);
   3899 }
   3900 
   3901 
   3902 
   3903 
   3904 //------------------------------------------------------------------------------
   3905 //
   3906 //  Error         Report a rule parse error.
   3907 //                Only report it if no previous error has been recorded.
   3908 //
   3909 //------------------------------------------------------------------------------
   3910 void RegexCompile::error(UErrorCode e) {
   3911    if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
   3912        *fStatus = e;
   3913        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
   3914        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
   3915        // int64_t. If the values of the latter are out of range for the former,
   3916        // set them to the appropriate "field not supported" values.
   3917        if (fLineNum > 0x7FFFFFFF) {
   3918            fParseErr->line   = 0;
   3919            fParseErr->offset = -1;
   3920        } else if (fCharNum > 0x7FFFFFFF) {
   3921            fParseErr->line = static_cast<int32_t>(fLineNum);
   3922            fParseErr->offset = -1;
   3923        } else {
   3924            fParseErr->line = static_cast<int32_t>(fLineNum);
   3925            fParseErr->offset = static_cast<int32_t>(fCharNum);
   3926        }
   3927 
   3928        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
   3929 
   3930        // Fill in the context.
   3931        //   Note: extractBetween() pins supplied indices to the string bounds.
   3932        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
   3933        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
   3934        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
   3935        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
   3936    }
   3937 }
   3938 
   3939 
   3940 //
   3941 //  Assorted Unicode character constants.
   3942 //     Numeric because there is no portable way to enter them as literals.
   3943 //     (Think EBCDIC).
   3944 //
   3945 static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
   3946 static const char16_t   chLF        = 0x0a;      // Line Feed
   3947 static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
   3948 static const char16_t   chDigit0    = 0x30;      // '0'
   3949 static const char16_t   chDigit7    = 0x37;      // '9'
   3950 static const char16_t   chColon     = 0x3A;      // ':'
   3951 static const char16_t   chE         = 0x45;      // 'E'
   3952 static const char16_t   chQ         = 0x51;      // 'Q'
   3953 //static const char16_t   chN         = 0x4E;      // 'N'
   3954 static const char16_t   chP         = 0x50;      // 'P'
   3955 static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
   3956 //static const char16_t   chLBracket  = 0x5b;      // '['
   3957 static const char16_t   chRBracket  = 0x5d;      // ']'
   3958 static const char16_t   chUp        = 0x5e;      // '^'
   3959 static const char16_t   chLowerP    = 0x70;
   3960 static const char16_t   chLBrace    = 0x7b;      // '{'
   3961 static const char16_t   chRBrace    = 0x7d;      // '}'
   3962 static const char16_t   chNEL       = 0x85;      //    NEL newline variant
   3963 static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
   3964 
   3965 
   3966 //------------------------------------------------------------------------------
   3967 //
   3968 //  nextCharLL    Low Level Next Char from the regex pattern.
   3969 //                Get a char from the string, keep track of input position
   3970 //                     for error reporting.
   3971 //
   3972 //------------------------------------------------------------------------------
   3973 UChar32  RegexCompile::nextCharLL() {
   3974    UChar32       ch;
   3975 
   3976    if (fPeekChar != -1) {
   3977        ch = fPeekChar;
   3978        fPeekChar = -1;
   3979        return ch;
   3980    }
   3981 
   3982    // assume we're already in the right place
   3983    ch = UTEXT_NEXT32(fRXPat->fPattern);
   3984    if (ch == U_SENTINEL) {
   3985        return ch;
   3986    }
   3987 
   3988    if (ch == chCR ||
   3989        ch == chNEL ||
   3990        ch == chLS   ||
   3991        (ch == chLF && fLastChar != chCR)) {
   3992        // Character is starting a new line.  Bump up the line number, and
   3993        //  reset the column to 0.
   3994        fLineNum++;
   3995        fCharNum=0;
   3996    }
   3997    else {
   3998        // Character is not starting a new line.  Except in the case of a
   3999        //   LF following a CR, increment the column position.
   4000        if (ch != chLF) {
   4001            fCharNum++;
   4002        }
   4003    }
   4004    fLastChar = ch;
   4005    return ch;
   4006 }
   4007 
   4008 //------------------------------------------------------------------------------
   4009 //
   4010 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
   4011 //                 character without actually getting it.
   4012 //
   4013 //------------------------------------------------------------------------------
   4014 UChar32  RegexCompile::peekCharLL() {
   4015    if (fPeekChar == -1) {
   4016        fPeekChar = nextCharLL();
   4017    }
   4018    return fPeekChar;
   4019 }
   4020 
   4021 
   4022 //------------------------------------------------------------------------------
   4023 //
   4024 //   nextChar     for pattern scanning.  At this level, we handle stripping
   4025 //                out comments and processing some backslash character escapes.
   4026 //                The rest of the pattern grammar is handled at the next level up.
   4027 //
   4028 //------------------------------------------------------------------------------
   4029 void RegexCompile::nextChar(RegexPatternChar &c) {
   4030  tailRecursion:
   4031    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4032    c.fChar    = nextCharLL();
   4033    c.fQuoted  = false;
   4034 
   4035    if (fQuoteMode) {
   4036        c.fQuoted = true;
   4037        if ((c.fChar == chBackSlash && peekCharLL() == chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
   4038            c.fChar == static_cast<UChar32>(-1)) {
   4039            fQuoteMode = false;  //  Exit quote mode,
   4040            nextCharLL();        // discard the E
   4041            // nextChar(c);      // recurse to get the real next char
   4042            goto tailRecursion;  // Note: fuzz testing produced testcases that
   4043                                 //       resulted in stack overflow here.
   4044        }
   4045    }
   4046    else if (fInBackslashQuote) {
   4047        // The current character immediately follows a '\'
   4048        // Don't check for any further escapes, just return it as-is.
   4049        // Don't set c.fQuoted, because that would prevent the state machine from
   4050        //    dispatching on the character.
   4051        fInBackslashQuote = false;
   4052    }
   4053    else
   4054    {
   4055        // We are not in a \Q quoted region \E of the source.
   4056        //
   4057        if (fModeFlags & UREGEX_COMMENTS) {
   4058            //
   4059            // We are in free-spacing and comments mode.
   4060            //  Scan through any white space and comments, until we
   4061            //  reach a significant character or the end of input.
   4062            for (;;) {
   4063                if (c.fChar == static_cast<UChar32>(-1)) {
   4064                    break;     // End of Input
   4065                }
   4066                if  (c.fChar == chPound && fEOLComments) {
   4067                    // Start of a comment.  Consume the rest of it, until EOF or a new line
   4068                    for (;;) {
   4069                        c.fChar = nextCharLL();
   4070                        if (c.fChar == static_cast<UChar32>(-1) || // EOF
   4071                            c.fChar == chCR        ||
   4072                            c.fChar == chLF        ||
   4073                            c.fChar == chNEL       ||
   4074                            c.fChar == chLS)       {
   4075                            break;
   4076                        }
   4077                    }
   4078                }
   4079                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
   4080                if (PatternProps::isWhiteSpace(c.fChar) == false) {
   4081                    break;
   4082                }
   4083                c.fChar = nextCharLL();
   4084            }
   4085        }
   4086 
   4087        //
   4088        //  check for backslash escaped characters.
   4089        //
   4090        if (c.fChar == chBackSlash) {
   4091            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4092            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
   4093                //
   4094                // A '\' sequence that is handled by ICU's standard unescapeAt function.
   4095                //   Includes \uxxxx, \n, \r, many others.
   4096                //   Return the single equivalent character.
   4097                //
   4098                nextCharLL();                 // get & discard the peeked char.
   4099                c.fQuoted = true;
   4100 
   4101                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
   4102                    int32_t endIndex = static_cast<int32_t>(pos);
   4103                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, static_cast<int32_t>(fPatternLength), const_cast<char16_t*>(fRXPat->fPattern->chunkContents));
   4104 
   4105                    if (endIndex == pos) {
   4106                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4107                    }
   4108                    fCharNum += endIndex - pos;
   4109                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
   4110                } else {
   4111                    int32_t offset = 0;
   4112                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
   4113 
   4114                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
   4115                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
   4116 
   4117                    if (offset == 0) {
   4118                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4119                    } else if (context.lastOffset == offset) {
   4120                        UTEXT_PREVIOUS32(fRXPat->fPattern);
   4121                    } else if (context.lastOffset != offset-1) {
   4122                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
   4123                    }
   4124                    fCharNum += offset;
   4125                }
   4126            }
   4127            else if (peekCharLL() == chDigit0) {
   4128                //  Octal Escape, using Java Regexp Conventions
   4129                //    which are \0 followed by 1-3 octal digits.
   4130                //    Different from ICU Unescape handling of Octal, which does not
   4131                //    require the leading 0.
   4132                //  Java also has the convention of only consuming 2 octal digits if
   4133                //    the three digit number would be > 0xff
   4134                //
   4135                c.fChar = 0;
   4136                nextCharLL();    // Consume the initial 0.
   4137                int index;
   4138                for (index=0; index<3; index++) {
   4139                    int32_t ch = peekCharLL();
   4140                    if (ch<chDigit0 || ch>chDigit7) {
   4141                        if (index==0) {
   4142                           // \0 is not followed by any octal digits.
   4143                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
   4144                        }
   4145                        break;
   4146                    }
   4147                    c.fChar <<= 3;
   4148                    c.fChar += ch&7;
   4149                    if (c.fChar <= 255) {
   4150                        nextCharLL();
   4151                    } else {
   4152                        // The last digit made the number too big.  Forget we saw it.
   4153                        c.fChar >>= 3;
   4154                    }
   4155                }
   4156                c.fQuoted = true;
   4157            }
   4158            else if (peekCharLL() == chQ) {
   4159                //  "\Q"  enter quote mode, which will continue until "\E"
   4160                fQuoteMode = true;
   4161                nextCharLL();        // discard the 'Q'.
   4162                // nextChar(c);      // recurse to get the real next char.
   4163                goto tailRecursion;  // Note: fuzz testing produced test cases that
   4164                //                            resulted in stack overflow here.
   4165            }
   4166            else
   4167            {
   4168                // We are in a '\' escape that will be handled by the state table scanner.
   4169                // Just return the backslash, but remember that the following char is to
   4170                //  be taken literally.
   4171                fInBackslashQuote = true;
   4172            }
   4173        }
   4174    }
   4175 
   4176    // re-enable # to end-of-line comments, in case they were disabled.
   4177    // They are disabled by the parser upon seeing '(?', but this lasts for
   4178    //  the fetching of the next character only.
   4179    fEOLComments = true;
   4180 
   4181    // putc(c.fChar, stdout);
   4182 }
   4183 
   4184 
   4185 
   4186 //------------------------------------------------------------------------------
   4187 //
   4188 //  scanNamedChar
   4189 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
   4190 //
   4191 //             The scan position will be at the 'N'.  On return
   4192 //             the scan position should be just after the '}'
   4193 //
   4194 //             Return the UChar32
   4195 //
   4196 //------------------------------------------------------------------------------
   4197 UChar32  RegexCompile::scanNamedChar() {
   4198    if (U_FAILURE(*fStatus)) {
   4199        return 0;
   4200    }
   4201 
   4202    nextChar(fC);
   4203    if (fC.fChar != chLBrace) {
   4204        error(U_REGEX_PROPERTY_SYNTAX);
   4205        return 0;
   4206    }
   4207 
   4208    UnicodeString  charName;
   4209    for (;;) {
   4210        nextChar(fC);
   4211        if (fC.fChar == chRBrace) {
   4212            break;
   4213        }
   4214        if (fC.fChar == -1) {
   4215            error(U_REGEX_PROPERTY_SYNTAX);
   4216            return 0;
   4217        }
   4218        charName.append(fC.fChar);
   4219    }
   4220 
   4221    char name[100];
   4222    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
   4223         static_cast<uint32_t>(charName.length()) >= sizeof(name)) {
   4224        // All Unicode character names have only invariant characters.
   4225        // The API to get a character, given a name, accepts only char *, forcing us to convert,
   4226        //   which requires this error check
   4227        error(U_REGEX_PROPERTY_SYNTAX);
   4228        return 0;
   4229    }
   4230    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
   4231 
   4232    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
   4233    if (U_FAILURE(*fStatus)) {
   4234        error(U_REGEX_PROPERTY_SYNTAX);
   4235    }
   4236 
   4237    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
   4238    return theChar;
   4239 }
   4240 
   4241 //------------------------------------------------------------------------------
   4242 //
   4243 //  scanProp   Construct a UnicodeSet from the text at the current scan
   4244 //             position, which will be of the form \p{whaterver}
   4245 //
   4246 //             The scan position will be at the 'p' or 'P'.  On return
   4247 //             the scan position should be just after the '}'
   4248 //
   4249 //             Return a UnicodeSet, constructed from the \P pattern,
   4250 //             or nullptr if the pattern is invalid.
   4251 //
   4252 //------------------------------------------------------------------------------
   4253 UnicodeSet *RegexCompile::scanProp() {
   4254    UnicodeSet    *uset = nullptr;
   4255 
   4256    if (U_FAILURE(*fStatus)) {
   4257        return nullptr;
   4258    }
   4259    (void)chLowerP;   // Suppress compiler unused variable warning.
   4260    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
   4261    UBool negated = (fC.fChar == chP);
   4262 
   4263    UnicodeString propertyName;
   4264    nextChar(fC);
   4265    if (fC.fChar != chLBrace) {
   4266        error(U_REGEX_PROPERTY_SYNTAX);
   4267        return nullptr;
   4268    }
   4269    for (;;) {
   4270        nextChar(fC);
   4271        if (fC.fChar == chRBrace) {
   4272            break;
   4273        }
   4274        if (fC.fChar == -1) {
   4275            // Hit the end of the input string without finding the closing '}'
   4276            error(U_REGEX_PROPERTY_SYNTAX);
   4277            return nullptr;
   4278        }
   4279        propertyName.append(fC.fChar);
   4280    }
   4281    uset = createSetForProperty(propertyName, negated);
   4282    nextChar(fC);    // Move input scan to position following the closing '}'
   4283    return uset;
   4284 }
   4285 
   4286 //------------------------------------------------------------------------------
   4287 //
   4288 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
   4289 //             position, which is expected be of the form [:property expression:]
   4290 //
   4291 //             The scan position will be at the opening ':'.  On return
   4292 //             the scan position must be on the closing ']'
   4293 //
   4294 //             Return a UnicodeSet constructed from the pattern,
   4295 //             or nullptr if this is not a valid POSIX-style set expression.
   4296 //             If not a property expression, restore the initial scan position
   4297 //                (to the opening ':')
   4298 //
   4299 //               Note:  the opening '[:' is not sufficient to guarantee that
   4300 //                      this is a [:property:] expression.
   4301 //                      [:'+=,] is a perfectly good ordinary set expression that
   4302 //                              happens to include ':' as one of its characters.
   4303 //
   4304 //------------------------------------------------------------------------------
   4305 UnicodeSet *RegexCompile::scanPosixProp() {
   4306    UnicodeSet    *uset = nullptr;
   4307 
   4308    if (U_FAILURE(*fStatus)) {
   4309        return nullptr;
   4310    }
   4311 
   4312    U_ASSERT(fC.fChar == chColon);
   4313 
   4314    // Save the scanner state.
   4315    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
   4316    int64_t     savedScanIndex        = fScanIndex;
   4317    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
   4318    UBool       savedQuoteMode        = fQuoteMode;
   4319    UBool       savedInBackslashQuote = fInBackslashQuote;
   4320    UBool       savedEOLComments      = fEOLComments;
   4321    int64_t     savedLineNum          = fLineNum;
   4322    int64_t     savedCharNum          = fCharNum;
   4323    UChar32     savedLastChar         = fLastChar;
   4324    UChar32     savedPeekChar         = fPeekChar;
   4325    RegexPatternChar savedfC          = fC;
   4326 
   4327    // Scan for a closing ].   A little tricky because there are some perverse
   4328    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
   4329    //   ending on the second closing ].
   4330 
   4331    UnicodeString propName;
   4332    UBool         negated  = false;
   4333 
   4334    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
   4335    nextChar(fC);
   4336    if (fC.fChar == chUp) {
   4337       negated = true;
   4338       nextChar(fC);
   4339    }
   4340 
   4341    // Scan for the closing ":]", collecting the property name along the way.
   4342    UBool  sawPropSetTerminator = false;
   4343    for (;;) {
   4344        propName.append(fC.fChar);
   4345        nextChar(fC);
   4346        if (fC.fQuoted || fC.fChar == -1) {
   4347            // Escaped characters or end of input - either says this isn't a [:Property:]
   4348            break;
   4349        }
   4350        if (fC.fChar == chColon) {
   4351            nextChar(fC);
   4352            if (fC.fChar == chRBracket) {
   4353                sawPropSetTerminator = true;
   4354            }
   4355            break;
   4356        }
   4357    }
   4358 
   4359    if (sawPropSetTerminator) {
   4360        uset = createSetForProperty(propName, negated);
   4361    }
   4362    else
   4363    {
   4364        // No closing ":]".
   4365        //  Restore the original scan position.
   4366        //  The main scanner will retry the input as a normal set expression,
   4367        //    not a [:Property:] expression.
   4368        fScanIndex        = savedScanIndex;
   4369        fQuoteMode        = savedQuoteMode;
   4370        fInBackslashQuote = savedInBackslashQuote;
   4371        fEOLComments      = savedEOLComments;
   4372        fLineNum          = savedLineNum;
   4373        fCharNum          = savedCharNum;
   4374        fLastChar         = savedLastChar;
   4375        fPeekChar         = savedPeekChar;
   4376        fC                = savedfC;
   4377        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
   4378    }
   4379    return uset;
   4380 }
   4381 
   4382 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
   4383    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
   4384    addCategory(set, U_GC_CF_MASK, ec);
   4385 }
   4386 
   4387 //
   4388 //  Create a Unicode Set from a Unicode Property expression.
   4389 //     This is common code underlying both \p{...} and [:...:] expressions.
   4390 //     Includes trying the Java "properties" that aren't supported as
   4391 //     normal ICU UnicodeSet properties
   4392 //
   4393 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
   4394 
   4395    if (U_FAILURE(*fStatus)) {
   4396        return nullptr;
   4397    }
   4398    LocalPointer<UnicodeSet> set;
   4399    UErrorCode status = U_ZERO_ERROR;
   4400 
   4401    do {      // non-loop, exists to allow breaks from the block.
   4402        //
   4403        //  First try the property as we received it
   4404        //
   4405        UnicodeString   setExpr;
   4406        uint32_t        usetFlags = 0;
   4407        setExpr.append(u"[\\p{", -1);
   4408        setExpr.append(propName);
   4409        setExpr.append(u"}]", -1);
   4410        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
   4411            usetFlags |= USET_CASE_INSENSITIVE;
   4412        }
   4413        set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status);
   4414        if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
   4415            break;
   4416        }
   4417 
   4418        //
   4419        //  The incoming property wasn't directly recognized by ICU.
   4420 
   4421        //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
   4422        //     Java accepts 'word' with mixed case.
   4423        //     Java accepts 'all' only in all lower case.
   4424 
   4425        status = U_ZERO_ERROR;
   4426        if (propName.caseCompare(u"word", -1, 0) == 0) {
   4427            set.adoptInsteadAndCheckErrorCode(
   4428                RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
   4429            break;
   4430        }
   4431        if (propName.compare(u"all", -1) == 0) {
   4432            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
   4433            break;
   4434        }
   4435 
   4436 
   4437        //    Do Java InBlock expressions
   4438        //
   4439        UnicodeString mPropName = propName;
   4440        if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
   4441            status = U_ZERO_ERROR;
   4442            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
   4443            if (U_FAILURE(status)) {
   4444                break;
   4445            }
   4446            UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
   4447            set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
   4448            break;
   4449        }
   4450 
   4451        //  Check for the Java form "IsBooleanPropertyValue", which we will recast
   4452        //  as "BooleanPropertyValue". The property value can be either a
   4453        //  a General Category or a Script Name.
   4454 
   4455        if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
   4456            mPropName.remove(0, 2);      // Strip the "Is"
   4457            if (mPropName.indexOf(u'=') >= 0) {
   4458                // Reject any "Is..." property expression containing an '=', that is,
   4459                // any non-binary property expression.
   4460                status = U_REGEX_PROPERTY_SYNTAX;
   4461                break;
   4462            }
   4463 
   4464            if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
   4465                mPropName.setTo(u"unassigned", -1);
   4466                negated = !negated;
   4467            } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
   4468                mPropName.setTo(u"Titlecase_Letter", -1);
   4469            }
   4470 
   4471            mPropName.insert(0, u"[\\p{", -1);
   4472            mPropName.append(u"}]", -1);
   4473            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
   4474 
   4475            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
   4476                set->closeOver(USET_CASE_INSENSITIVE);
   4477            }
   4478            break;
   4479 
   4480        }
   4481 
   4482        if (propName.startsWith(u"java", -1)) {
   4483            status = U_ZERO_ERROR;
   4484            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
   4485            if (U_FAILURE(status)) {
   4486                break;
   4487            }
   4488            //
   4489            //  Try the various Java specific properties.
   4490            //   These all begin with "java"
   4491            //
   4492            if (propName.compare(u"javaDefined", -1) == 0) {
   4493                addCategory(set.getAlias(), U_GC_CN_MASK, status);
   4494                set->complement();
   4495            }
   4496            else if (propName.compare(u"javaDigit", -1) == 0) {
   4497                addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4498            }
   4499            else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
   4500                addIdentifierIgnorable(set.getAlias(), status);
   4501            }
   4502            else if (propName.compare(u"javaISOControl", -1) == 0) {
   4503                set->add(0, 0x1F).add(0x7F, 0x9F);
   4504            }
   4505            else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
   4506                addCategory(set.getAlias(), U_GC_L_MASK, status);
   4507                addCategory(set.getAlias(), U_GC_SC_MASK, status);
   4508                addCategory(set.getAlias(), U_GC_PC_MASK, status);
   4509                addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4510                addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4511                addCategory(set.getAlias(), U_GC_MC_MASK, status);
   4512                addCategory(set.getAlias(), U_GC_MN_MASK, status);
   4513                addIdentifierIgnorable(set.getAlias(), status);
   4514            }
   4515            else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
   4516                addCategory(set.getAlias(), U_GC_L_MASK, status);
   4517                addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4518                addCategory(set.getAlias(), U_GC_SC_MASK, status);
   4519                addCategory(set.getAlias(), U_GC_PC_MASK, status);
   4520            }
   4521            else if (propName.compare(u"javaLetter", -1) == 0) {
   4522                addCategory(set.getAlias(), U_GC_L_MASK, status);
   4523            }
   4524            else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
   4525                addCategory(set.getAlias(), U_GC_L_MASK, status);
   4526                addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4527            }
   4528            else if (propName.compare(u"javaLowerCase", -1) == 0) {
   4529                addCategory(set.getAlias(), U_GC_LL_MASK, status);
   4530            }
   4531            else if (propName.compare(u"javaMirrored", -1) == 0) {
   4532                set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
   4533            }
   4534            else if (propName.compare(u"javaSpaceChar", -1) == 0) {
   4535                addCategory(set.getAlias(), U_GC_Z_MASK, status);
   4536            }
   4537            else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
   4538                set->add(0x10000, UnicodeSet::MAX_VALUE);
   4539            }
   4540            else if (propName.compare(u"javaTitleCase", -1) == 0) {
   4541                addCategory(set.getAlias(), U_GC_LT_MASK, status);
   4542            }
   4543            else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
   4544                addCategory(set.getAlias(), U_GC_L_MASK, status);
   4545                addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4546            }
   4547            else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
   4548                addCategory(set.getAlias(), U_GC_L_MASK, status);
   4549                addCategory(set.getAlias(), U_GC_PC_MASK, status);
   4550                addCategory(set.getAlias(), U_GC_ND_MASK, status);
   4551                addCategory(set.getAlias(), U_GC_NL_MASK, status);
   4552                addCategory(set.getAlias(), U_GC_MC_MASK, status);
   4553                addCategory(set.getAlias(), U_GC_MN_MASK, status);
   4554                addIdentifierIgnorable(set.getAlias(), status);
   4555            }
   4556            else if (propName.compare(u"javaUpperCase", -1) == 0) {
   4557                addCategory(set.getAlias(), U_GC_LU_MASK, status);
   4558            }
   4559            else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
   4560                set->add(0, UnicodeSet::MAX_VALUE);
   4561            }
   4562            else if (propName.compare(u"javaWhitespace", -1) == 0) {
   4563                addCategory(set.getAlias(), U_GC_Z_MASK, status);
   4564                set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
   4565                set->add(9, 0x0d).add(0x1c, 0x1f);
   4566            } else {
   4567                status = U_REGEX_PROPERTY_SYNTAX;
   4568            }
   4569 
   4570            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
   4571                set->closeOver(USET_CASE_INSENSITIVE);
   4572            }
   4573            break;
   4574        }
   4575 
   4576        // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
   4577        // extensions matched it.
   4578        status = U_REGEX_PROPERTY_SYNTAX;
   4579    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
   4580 
   4581    if (U_SUCCESS(status)) {
   4582        // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
   4583        // deal with properties of strings and character classes with strings, we ignore them.
   4584        // Just in case something downstream might stumble over the strings,
   4585        // we remove them from the set.
   4586        // Note that when we support strings, the complement of a property (as with \P)
   4587        // should be implemented as .complement().removeAllStrings() (code point complement).
   4588        set->removeAllStrings();
   4589        U_ASSERT(set.isValid());
   4590        if (negated) {
   4591            set->complement();
   4592        }
   4593        return set.orphan();
   4594    } else {
   4595        if (status == U_ILLEGAL_ARGUMENT_ERROR) {
   4596            status = U_REGEX_PROPERTY_SYNTAX;
   4597        }
   4598        error(status);
   4599        return nullptr;
   4600    }
   4601 }
   4602 
   4603 
   4604 //
   4605 //  SetEval   Part of the evaluation of [set expressions].
   4606 //            Perform any pending (stacked) operations with precedence
   4607 //            equal or greater to that of the next operator encountered
   4608 //            in the expression.
   4609 //
   4610 void RegexCompile::setEval(int32_t nextOp) {
   4611    UnicodeSet *rightOperand = nullptr;
   4612    UnicodeSet *leftOperand  = nullptr;
   4613    for (;;) {
   4614        U_ASSERT(fSetOpStack.empty()==false);
   4615        int32_t pendingSetOperation = fSetOpStack.peeki();
   4616        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
   4617            break;
   4618        }
   4619        fSetOpStack.popi();
   4620        U_ASSERT(fSetStack.empty() == false);
   4621        rightOperand = static_cast<UnicodeSet*>(fSetStack.peek());
   4622        // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
   4623        // (see comments there).
   4624        // We also do not yet support string literals in character classes,
   4625        // so there should not be any strings.
   4626        // Note that when we support strings, the complement of a set (as with ^ or \P)
   4627        // should be implemented as .complement().removeAllStrings() (code point complement).
   4628        U_ASSERT(!rightOperand->hasStrings());
   4629        switch (pendingSetOperation) {
   4630            case setNegation:
   4631                rightOperand->complement();
   4632                break;
   4633            case setCaseClose:
   4634                // TODO: need a simple close function.  Ticket 6065
   4635                rightOperand->closeOver(USET_CASE_INSENSITIVE);
   4636                rightOperand->removeAllStrings();
   4637                break;
   4638            case setDifference1:
   4639            case setDifference2:
   4640                fSetStack.pop();
   4641                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
   4642                leftOperand->removeAll(*rightOperand);
   4643                delete rightOperand;
   4644                break;
   4645            case setIntersection1:
   4646            case setIntersection2:
   4647                fSetStack.pop();
   4648                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
   4649                leftOperand->retainAll(*rightOperand);
   4650                delete rightOperand;
   4651                break;
   4652            case setUnion:
   4653                fSetStack.pop();
   4654                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
   4655                leftOperand->addAll(*rightOperand);
   4656                delete rightOperand;
   4657                break;
   4658            default:
   4659                UPRV_UNREACHABLE_EXIT;
   4660            }
   4661        }
   4662    }
   4663 
   4664 void RegexCompile::setPushOp(int32_t op) {
   4665    setEval(op);
   4666    fSetOpStack.push(op, *fStatus);
   4667    LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
   4668    fSetStack.push(lpSet.orphan(), *fStatus);
   4669 }
   4670 
   4671 U_NAMESPACE_END
   4672 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE