[ tor-browser ].git.dasho

rbbiscan.cpp (47842B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  rbbiscan.cpp
      5 //
      6 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the Rule Based Break Iterator Rule Builder functions for
     10 //   scanning the rules and assembling a parse tree.  This is the first phase
     11 //   of compiling the rules.
     12 //
     13 //  The overall of the rules is managed by class RBBIRuleBuilder, which will
     14 //  create and use an instance of this class as part of the process.
     15 //
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_BREAK_ITERATION
     20 
     21 #include "unicode/unistr.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/uchar.h"
     24 #include "unicode/uchriter.h"
     25 #include "unicode/parsepos.h"
     26 #include "unicode/parseerr.h"
     27 #include "cmemory.h"
     28 #include "cstring.h"
     29 
     30 #include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
     31                       //   generated by a Perl script.
     32 #include "rbbirb.h"
     33 #include "rbbinode.h"
     34 #include "rbbiscan.h"
     35 #include "rbbitblb.h"
     36 
     37 #include "uassert.h"
     38 
     39 //------------------------------------------------------------------------------
     40 //
     41 // Unicode Set init strings for each of the character classes needed for parsing a rule file.
     42 //               (Initialized with hex values for portability to EBCDIC based machines.
     43 //                Really ugly, but there's no good way to avoid it.)
     44 //
     45 //              The sets are referred to by name in the rbbirpt.txt, which is the
     46 //              source form of the state transition table for the RBBI rule parser.
     47 //
     48 //------------------------------------------------------------------------------
     49 static const char16_t gRuleSet_rule_char_pattern[]       = {
     50 // Characters that may appear as literals in patterns without escaping or quoting.
     51 //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0
     52    0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
     53 //   -    \      u    0     0     7      f     ]     -     [    \      p
     54    0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
     55 //   {     L     }    ]     -     [      \     p     {     N    }      ]     ]
     56    0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
     57 
     58 static const char16_t gRuleSet_name_char_pattern[]       = {
     59 //    [    _      \    p     {     L      }     \     p     {    N      }     ]
     60    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
     61 
     62 static const char16_t gRuleSet_digit_char_pattern[] = {
     63 //    [    0      -    9     ]
     64    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
     65 
     66 static const char16_t gRuleSet_name_start_char_pattern[] = {
     67 //    [    _      \    p     {     L      }     ]
     68    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
     69 
     70 static const char16_t kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"
     71 
     72 
     73 U_CDECL_BEGIN
     74 static void U_CALLCONV RBBISetTable_deleter(void *p) {
     75    icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;
     76    delete px->key;
     77    // Note:  px->val is owned by the linked list "fSetsListHead" in scanner.
     78    //        Don't delete the value nodes here.
     79    uprv_free(px);
     80 }
     81 U_CDECL_END
     82 
     83 U_NAMESPACE_BEGIN
     84 
     85 //------------------------------------------------------------------------------
     86 //
     87 //  Constructor.
     88 //
     89 //------------------------------------------------------------------------------
     90 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
     91 {
     92    fRB                 = rb;
     93    fScanIndex          = 0;
     94    fNextIndex          = 0;
     95    fQuoteMode          = false;
     96    fLineNum            = 1;
     97    fCharNum            = 0;
     98    fLastChar           = 0;
     99    
    100    fStateTable         = nullptr;
    101    fStack[0]           = 0;
    102    fStackPtr           = 0;
    103    fNodeStack[0]       = nullptr;
    104    fNodeStackPtr       = 0;
    105 
    106    fReverseRule        = false;
    107    fLookAheadRule      = false;
    108    fNoChainInRule      = false;
    109 
    110    fSymbolTable        = nullptr;
    111    fSetTable           = nullptr;
    112    fRuleNum            = 0;
    113    fOptionStart        = 0;
    114 
    115    // Do not check status until after all critical fields are sufficiently initialized
    116    //   that the destructor can run cleanly.
    117    if (U_FAILURE(*rb->fStatus)) {
    118        return;
    119    }
    120 
    121    //
    122    //  Set up the constant Unicode Sets.
    123    //     Note:  These could be made static, lazily initialized, and shared among
    124    //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,
    125    //            and the time to build these few sets should be small compared to a
    126    //            full break iterator build.
    127    fRuleSets[kRuleSet_rule_char-128]
    128        = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern),       *rb->fStatus);
    129    // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
    130    fRuleSets[kRuleSet_white_space-128].
    131        add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
    132    fRuleSets[kRuleSet_name_char-128]
    133        = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern),       *rb->fStatus);
    134    fRuleSets[kRuleSet_name_start_char-128]
    135        = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
    136    fRuleSets[kRuleSet_digit_char-128]
    137        = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern),      *rb->fStatus);
    138    if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
    139        // This case happens if ICU's data is missing.  UnicodeSet tries to look up property
    140        //   names from the init string, can't find them, and claims an illegal argument.
    141        //   Change the error so that the actual problem will be clearer to users.
    142        *rb->fStatus = U_BRK_INIT_ERROR;
    143    }
    144    if (U_FAILURE(*rb->fStatus)) {
    145        return;
    146    }
    147 
    148    fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
    149    if (fSymbolTable == nullptr) {
    150        *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
    151        return;
    152    }
    153    fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, rb->fStatus);
    154    if (U_FAILURE(*rb->fStatus)) {
    155        return;
    156    }
    157    uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
    158 }
    159 
    160 
    161 
    162 //------------------------------------------------------------------------------
    163 //
    164 //  Destructor
    165 //
    166 //------------------------------------------------------------------------------
    167 RBBIRuleScanner::~RBBIRuleScanner() {
    168    delete fSymbolTable;
    169    if (fSetTable != nullptr) {
    170         uhash_close(fSetTable);
    171         fSetTable = nullptr;
    172 
    173    }
    174 
    175 
    176    // Node Stack.
    177    //   Normally has one entry, which is the entire parse tree for the rules.
    178    //   If errors occurred, there may be additional subtrees left on the stack.
    179    while (fNodeStackPtr > 0) {
    180        delete fNodeStack[fNodeStackPtr];
    181        fNodeStackPtr--;
    182    }
    183 
    184 }
    185 
    186 //------------------------------------------------------------------------------
    187 //
    188 //  doParseAction        Do some action during rule parsing.
    189 //                       Called by the parse state machine.
    190 //                       Actions build the parse tree and Unicode Sets,
    191 //                       and maintain the parse stack for nested expressions.
    192 //
    193 //                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
    194 //                              They represent exactly the same thing.  They're separate
    195 //                              only to work around enum forward declaration restrictions
    196 //                              in some compilers, while at the same time avoiding multiple
    197 //                              definitions problems.  I'm sure that there's a better way.
    198 //
    199 //------------------------------------------------------------------------------
    200 UBool RBBIRuleScanner::doParseActions(int32_t action)
    201 {
    202    RBBINode *n       = nullptr;
    203 
    204    UBool   returnVal = true;
    205 
    206    switch (action) {
    207 
    208    case doExprStart:
    209        pushNewNode(RBBINode::opStart);
    210        fRuleNum++;
    211        break;
    212 
    213 
    214    case doNoChain:
    215        // Scanned a '^' while on the rule start state.
    216        fNoChainInRule = true;
    217        break;
    218 
    219 
    220    case doExprOrOperator:
    221        {
    222            fixOpStack(RBBINode::precOpCat);
    223            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
    224            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
    225            if (U_FAILURE(*fRB->fStatus)) {
    226                break;
    227            }
    228            orNode->fLeftChild     = operandNode;
    229            operandNode->fParent   = orNode;
    230        }
    231        break;
    232 
    233    case doExprCatOperator:
    234        // concatenation operator.
    235        // For the implicit concatenation of adjacent terms in an expression that are
    236        //   not separated by any other operator.  Action is invoked between the
    237        //   actions for the two terms.
    238        {
    239            fixOpStack(RBBINode::precOpCat);
    240            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
    241            RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
    242            if (U_FAILURE(*fRB->fStatus)) {
    243                break;
    244            }
    245            catNode->fLeftChild    = operandNode;
    246            operandNode->fParent   = catNode;
    247        }
    248        break;
    249 
    250    case doLParen:
    251        // Open Paren.
    252        //   The openParen node is a dummy operation type with a low precedence,
    253        //     which has the affect of ensuring that any real binary op that
    254        //     follows within the parens binds more tightly to the operands than
    255        //     stuff outside of the parens.
    256        pushNewNode(RBBINode::opLParen);
    257        break;
    258 
    259    case doExprRParen:
    260        fixOpStack(RBBINode::precLParen);
    261        break;
    262 
    263    case doNOP:
    264        break;
    265 
    266    case doStartAssign:
    267        // We've just scanned "$variable = "
    268        // The top of the node stack has the $variable ref node.
    269 
    270        // Save the start position of the RHS text in the StartExpression node
    271        //   that precedes the $variableReference node on the stack.
    272        //   This will eventually be used when saving the full $variable replacement
    273        //   text as a string.
    274        n = fNodeStack[fNodeStackPtr-1];
    275        n->fFirstPos = fNextIndex;              // move past the '='
    276 
    277        // Push a new start-of-expression node; needed to keep parse of the
    278        //   RHS expression happy.
    279        pushNewNode(RBBINode::opStart);
    280        break;
    281 
    282 
    283 
    284 
    285    case doEndAssign:
    286        {
    287            // We have reached the end of an assignment statement.
    288            //   Current scan char is the ';' that terminates the assignment.
    289 
    290            // Terminate expression, leaves expression parse tree rooted in TOS node.
    291            fixOpStack(RBBINode::precStart);
    292            if (U_FAILURE(*fRB->fStatus)) {
    293                break;
    294            }
    295 
    296            RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
    297            RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
    298            RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];
    299 
    300            // Save original text of right side of assignment, excluding the terminating ';'
    301            //  in the root of the node for the right-hand-side expression.
    302            RHSExprNode->fFirstPos = startExprNode->fFirstPos;
    303            RHSExprNode->fLastPos  = fScanIndex;
    304            fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
    305 
    306            // Expression parse tree becomes l. child of the $variable reference node.
    307            varRefNode->fLeftChild = RHSExprNode;
    308            RHSExprNode->fParent   = varRefNode;
    309 
    310            // Make a symbol table entry for the $variableRef node.
    311            fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
    312            if (U_FAILURE(*fRB->fStatus)) {
    313                // This is a round-about way to get the parse position set
    314                //  so that duplicate symbols error messages include a line number.
    315                UErrorCode t = *fRB->fStatus;
    316                *fRB->fStatus = U_ZERO_ERROR;
    317                error(t);
    318                // When adding $variableRef to the symbol table fail, Delete
    319                // both nodes because deleting varRefNode will not delete
    320                // RHSExprNode internally.
    321                delete RHSExprNode;
    322                delete varRefNode;
    323            }
    324 
    325            // Clean up the stack.
    326            delete startExprNode;
    327            fNodeStackPtr-=3;
    328            break;
    329        }
    330 
    331    case doEndOfRule:
    332        {
    333        fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
    334        if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
    335            break;
    336        }
    337 #ifdef RBBI_DEBUG
    338        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
    339 #endif
    340        U_ASSERT(fNodeStackPtr == 1);
    341        RBBINode *thisRule = fNodeStack[fNodeStackPtr];
    342 
    343        // If this rule includes a look-ahead '/', add a endMark node to the
    344        //   expression tree.
    345        if (fLookAheadRule) {
    346            RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
    347            RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
    348            if (U_FAILURE(*fRB->fStatus)) {
    349                break;
    350            }
    351            fNodeStackPtr -= 2;
    352            catNode->fLeftChild       = thisRule;
    353            catNode->fRightChild      = endNode;
    354            fNodeStack[fNodeStackPtr] = catNode;
    355            endNode->fVal             = fRuleNum;
    356            endNode->fLookAheadEnd    = true;
    357            thisRule                  = catNode;
    358 
    359            // TODO: Disable chaining out of look-ahead (hard break) rules.
    360            //   The break on rule match is forced, so there is no point in building up
    361            //   the state table to chain into another rule for a longer match.
    362        }
    363 
    364        // Mark this node as being the root of a rule.
    365        thisRule->fRuleRoot = true;
    366 
    367        // Flag if chaining into this rule is wanted.
    368        //    
    369        if (fRB->fChainRules &&         // If rule chaining is enabled globally via !!chain
    370                !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
    371            thisRule->fChainIn = true;
    372        }
    373 
    374 
    375        // All rule expressions are ORed together.
    376        // The ';' that terminates an expression really just functions as a '|' with
    377        //   a low operator prededence.
    378        //
    379        // Each of the four sets of rules are collected separately.
    380        //  (forward, reverse, safe_forward, safe_reverse)
    381        //  OR this rule into the appropriate group of them.
    382        //
    383        RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
    384 
    385        if (*destRules != nullptr) {
    386            // This is not the first rule encountered.
    387            // OR previous stuff  (from *destRules)
    388            // with the current rule expression (on the Node Stack)
    389            //  with the resulting OR expression going to *destRules
    390            //
    391                       thisRule    = fNodeStack[fNodeStackPtr];
    392            RBBINode  *prevRules   = *destRules;
    393            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
    394            if (U_FAILURE(*fRB->fStatus)) {
    395                break;
    396            }
    397            orNode->fLeftChild     = prevRules;
    398            prevRules->fParent     = orNode;
    399            orNode->fRightChild    = thisRule;
    400            thisRule->fParent      = orNode;
    401            *destRules             = orNode;
    402        }
    403        else
    404        {
    405            // This is the first rule encountered (for this direction).
    406            // Just move its parse tree from the stack to *destRules.
    407            *destRules = fNodeStack[fNodeStackPtr];
    408        }
    409        fReverseRule   = false;   // in preparation for the next rule.
    410        fLookAheadRule = false;
    411        fNoChainInRule = false;
    412        fNodeStackPtr  = 0;
    413        }
    414        break;
    415 
    416 
    417    case doRuleError:
    418        error(U_BRK_RULE_SYNTAX);
    419        returnVal = false;
    420        break;
    421 
    422 
    423    case doVariableNameExpectedErr:
    424        error(U_BRK_RULE_SYNTAX);
    425        break;
    426 
    427 
    428    //
    429    //  Unary operands  + ? *
    430    //    These all appear after the operand to which they apply.
    431    //    When we hit one, the operand (may be a whole sub expression)
    432    //    will be on the top of the stack.
    433    //    Unary Operator becomes TOS, with the old TOS as its one child.
    434    case doUnaryOpPlus:
    435        {
    436            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
    437            RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
    438            if (U_FAILURE(*fRB->fStatus)) {
    439                break;
    440            }
    441            plusNode->fLeftChild   = operandNode;
    442            operandNode->fParent   = plusNode;
    443        }
    444        break;
    445 
    446    case doUnaryOpQuestion:
    447        {
    448            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
    449            RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
    450            if (U_FAILURE(*fRB->fStatus)) {
    451                break;
    452            }
    453            qNode->fLeftChild      = operandNode;
    454            operandNode->fParent   = qNode;
    455        }
    456        break;
    457 
    458    case doUnaryOpStar:
    459        {
    460            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
    461            RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
    462            if (U_FAILURE(*fRB->fStatus)) {
    463                break;
    464            }
    465            starNode->fLeftChild   = operandNode;
    466            operandNode->fParent   = starNode;
    467        }
    468        break;
    469 
    470    case doRuleChar:
    471        // A "Rule Character" is any single character that is a literal part
    472        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
    473        // These are pretty uncommon in break rules; the terms are more commonly
    474        //  sets.  To keep things uniform, treat these characters like as
    475        // sets that just happen to contain only one character.
    476        {
    477            n = pushNewNode(RBBINode::setRef);
    478            if (U_FAILURE(*fRB->fStatus)) {
    479                break;
    480            }
    481            findSetFor(UnicodeString(fC.fChar), n);
    482            n->fFirstPos = fScanIndex;
    483            n->fLastPos  = fNextIndex;
    484            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
    485            break;
    486        }
    487 
    488    case doDotAny:
    489        // scanned a ".", meaning match any single character.
    490        {
    491            n = pushNewNode(RBBINode::setRef);
    492            if (U_FAILURE(*fRB->fStatus)) {
    493                break;
    494            }
    495            findSetFor(UnicodeString(true, kAny, 3), n);
    496            n->fFirstPos = fScanIndex;
    497            n->fLastPos  = fNextIndex;
    498            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
    499            break;
    500        }
    501 
    502    case doSlash:
    503        // Scanned a '/', which identifies a look-ahead break position in a rule.
    504        n = pushNewNode(RBBINode::lookAhead);
    505        if (U_FAILURE(*fRB->fStatus)) {
    506            break;
    507        }
    508        n->fVal      = fRuleNum;
    509        n->fFirstPos = fScanIndex;
    510        n->fLastPos  = fNextIndex;
    511        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
    512        fLookAheadRule = true;
    513        break;
    514 
    515 
    516    case doStartTagValue:
    517        // Scanned a '{', the opening delimiter for a tag value within a rule.
    518        n = pushNewNode(RBBINode::tag);
    519        if (U_FAILURE(*fRB->fStatus)) {
    520            break;
    521        }
    522        n->fVal      = 0;
    523        n->fFirstPos = fScanIndex;
    524        n->fLastPos  = fNextIndex;
    525        break;
    526 
    527    case doTagDigit:
    528        // Just scanned a decimal digit that's part of a tag value
    529        {
    530            n = fNodeStack[fNodeStackPtr];
    531            uint32_t v = u_charDigitValue(fC.fChar);
    532            U_ASSERT(v < 10);
    533            int64_t updated = static_cast<int64_t>(n->fVal)*10 + v;
    534            // Avoid overflow n->fVal
    535            if (updated > INT32_MAX) {
    536                error(U_BRK_RULE_SYNTAX);
    537                break;
    538            }
    539            n->fVal = static_cast<int32_t>(updated);
    540            break;
    541        }
    542 
    543    case doTagValue:
    544        n = fNodeStack[fNodeStackPtr];
    545        n->fLastPos = fNextIndex;
    546        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
    547        break;
    548 
    549    case doTagExpectedError:
    550        error(U_BRK_MALFORMED_RULE_TAG);
    551        returnVal = false;
    552        break;
    553 
    554    case doOptionStart:
    555        // Scanning a !!option.   At the start of string.
    556        fOptionStart = fScanIndex;
    557        break;
    558 
    559    case doOptionEnd:
    560        {
    561            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
    562            if (opt == UNICODE_STRING("chain", 5)) {
    563                fRB->fChainRules = true;
    564            } else if (opt == UNICODE_STRING("forward", 7)) {
    565                fRB->fDefaultTree   = &fRB->fForwardTree;
    566            } else if (opt == UNICODE_STRING("reverse", 7)) {
    567                fRB->fDefaultTree   = &fRB->fReverseTree;
    568            } else if (opt == UNICODE_STRING("safe_forward", 12)) {
    569                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
    570            } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
    571                fRB->fDefaultTree   = &fRB->fSafeRevTree;
    572            } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
    573                fRB->fLookAheadHardBreak = true;
    574            } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) {
    575                fRuleSets[kRuleSet_rule_char-128].clear();
    576            } else if (opt == UNICODE_STRING("unquoted_literals",  17)) {
    577                fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
    578            } else {
    579                error(U_BRK_UNRECOGNIZED_OPTION);
    580            }
    581        }
    582        break;
    583 
    584    case doReverseDir:
    585        fReverseRule = true;
    586        break;
    587 
    588    case doStartVariableName:
    589        n = pushNewNode(RBBINode::varRef);
    590        if (U_FAILURE(*fRB->fStatus)) {
    591            break;
    592        }
    593        n->fFirstPos = fScanIndex;
    594        break;
    595 
    596    case doEndVariableName:
    597        n = fNodeStack[fNodeStackPtr];
    598        if (n==nullptr || n->fType != RBBINode::varRef) {
    599            error(U_BRK_INTERNAL_ERROR);
    600            break;
    601        }
    602        n->fLastPos = fScanIndex;
    603        fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
    604        // Look the newly scanned name up in the symbol table
    605        //   If there's an entry, set the l. child of the var ref to the replacement expression.
    606        //   (We also pass through here when scanning assignments, but no harm is done, other
    607        //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
    608        n->fLeftChild = fSymbolTable->lookupNode(n->fText);
    609        break;
    610 
    611    case doCheckVarDef:
    612        n = fNodeStack[fNodeStackPtr];
    613        if (n->fLeftChild == nullptr) {
    614            error(U_BRK_UNDEFINED_VARIABLE);
    615            returnVal = false;
    616        }
    617        break;
    618 
    619    case doExprFinished:
    620        break;
    621 
    622    case doRuleErrorAssignExpr:
    623        error(U_BRK_ASSIGN_ERROR);
    624        returnVal = false;
    625        break;
    626 
    627    case doExit:
    628        returnVal = false;
    629        break;
    630 
    631    case doScanUnicodeSet:
    632        scanSet();
    633        break;
    634 
    635    default:
    636        error(U_BRK_INTERNAL_ERROR);
    637        returnVal = false;
    638        break;
    639    }
    640    return returnVal && U_SUCCESS(*fRB->fStatus);
    641 }
    642 
    643 
    644 
    645 
    646 //------------------------------------------------------------------------------
    647 //
    648 //  Error         Report a rule parse error.
    649 //                Only report it if no previous error has been recorded.
    650 //
    651 //------------------------------------------------------------------------------
    652 void RBBIRuleScanner::error(UErrorCode e) {
    653    if (U_SUCCESS(*fRB->fStatus)) {
    654        *fRB->fStatus = e;
    655        if (fRB->fParseError) {
    656            fRB->fParseError->line  = fLineNum;
    657            fRB->fParseError->offset = fCharNum;
    658            fRB->fParseError->preContext[0] = 0;
    659            fRB->fParseError->postContext[0] = 0;
    660        }
    661    }
    662 }
    663 
    664 
    665 
    666 
    667 //------------------------------------------------------------------------------
    668 //
    669 //  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
    670 //               An entry on the stack may be as small as a single setRef node,
    671 //               or as large as the parse tree
    672 //               for an entire expression (this will be the one item left on the stack
    673 //               when the parsing of an RBBI rule completes.
    674 //
    675 //               This function is called when a binary operator is encountered.
    676 //               It looks back up the stack for operators that are not yet associated
    677 //               with a right operand, and if the precedence of the stacked operator >=
    678 //               the precedence of the current operator, binds the operand left,
    679 //               to the previously encountered operator.
    680 //
    681 //------------------------------------------------------------------------------
    682 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
    683    RBBINode *n;
    684    // printNodeStack("entering fixOpStack()");
    685    for (;;) {
    686        n = fNodeStack[fNodeStackPtr-1];   // an operator node
    687        if (n->fPrecedence == 0) {
    688            RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
    689            error(U_BRK_INTERNAL_ERROR);
    690            return;
    691        }
    692 
    693        if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
    694            // The most recent operand goes with the current operator,
    695            //   not with the previously stacked one.
    696            break;
    697        }
    698            // Stack operator is a binary op  ( '|' or concatenation)
    699            //   TOS operand becomes right child of this operator.
    700            //   Resulting subexpression becomes the TOS operand.
    701            n->fRightChild = fNodeStack[fNodeStackPtr];
    702            fNodeStack[fNodeStackPtr]->fParent = n;
    703            fNodeStackPtr--;
    704        // printNodeStack("looping in fixOpStack()   ");
    705    }
    706 
    707    if (p <= RBBINode::precLParen) {
    708        // Scan is at a right paren or end of expression.
    709        //  The scanned item must match the stack, or else there was an error.
    710        //  Discard the left paren (or start expr) node from the stack,
    711            //  leaving the completed (sub)expression as TOS.
    712            if (n->fPrecedence != p) {
    713                // Right paren encountered matched start of expression node, or
    714                // end of expression matched with a left paren node.
    715                error(U_BRK_MISMATCHED_PAREN);
    716            }
    717            fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
    718            fNodeStackPtr--;
    719            // Delete the now-discarded LParen or Start node.
    720            delete n;
    721    }
    722    // printNodeStack("leaving fixOpStack()");
    723 }
    724 
    725 
    726 
    727 
    728 //------------------------------------------------------------------------------
    729 //
    730 //   findSetFor    given a UnicodeString,
    731 //                  - find the corresponding Unicode Set  (uset node)
    732 //                         (create one if necessary)
    733 //                  - Set fLeftChild of the caller's node (should be a setRef node)
    734 //                         to the uset node
    735 //                 Maintain a hash table of uset nodes, so the same one is always used
    736 //                    for the same string.
    737 //                 If a "to adopt" set is provided and we haven't seen this key before,
    738 //                    add the provided set to the hash table.
    739 //                 If the string is one (32 bit) char in length, the set contains
    740 //                    just one element which is the char in question.
    741 //                 If the string is "any", return a set containing all chars.
    742 //
    743 //------------------------------------------------------------------------------
    744 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
    745 
    746    RBBISetTableEl   *el;
    747 
    748    // First check whether we've already cached a set for this string.
    749    // If so, just use the cached set in the new node.
    750    //   delete any set provided by the caller, since we own it.
    751    el = static_cast<RBBISetTableEl*>(uhash_get(fSetTable, &s));
    752    if (el != nullptr) {
    753        delete setToAdopt;
    754        node->fLeftChild = el->val;
    755        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
    756        return;
    757    }
    758 
    759    // Haven't seen this set before.
    760    // If the caller didn't provide us with a prebuilt set,
    761    //   create a new UnicodeSet now.
    762    if (setToAdopt == nullptr) {
    763        if (s.compare(kAny, -1) == 0) {
    764            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
    765        } else {
    766            UChar32 c;
    767            c = s.char32At(0);
    768            setToAdopt = new UnicodeSet(c, c);
    769        }
    770        if (setToAdopt == nullptr) {
    771            error(U_MEMORY_ALLOCATION_ERROR);
    772            return;
    773        }
    774    }
    775 
    776    //
    777    // Make a new uset node to refer to this UnicodeSet
    778    // This new uset node becomes the child of the caller's setReference node.
    779    //
    780    UErrorCode localStatus = U_ZERO_ERROR;
    781    RBBINode *usetNode    = new RBBINode(RBBINode::uset, localStatus);
    782    if (usetNode == nullptr) {
    783        localStatus = U_MEMORY_ALLOCATION_ERROR;
    784    }
    785    if (U_FAILURE(localStatus)) {
    786        delete usetNode;
    787        error(localStatus);
    788        delete setToAdopt;
    789        return;
    790    }
    791    usetNode->fInputSet   = setToAdopt;
    792    usetNode->fParent     = node;
    793    node->fLeftChild      = usetNode;
    794    usetNode->fText = s;
    795 
    796 
    797    //
    798    // Add the new uset node to the list of all uset nodes.
    799    //
    800    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
    801 
    802 
    803    //
    804    // Add the new set to the set hash table.
    805    //
    806    el = static_cast<RBBISetTableEl*>(uprv_malloc(sizeof(RBBISetTableEl)));
    807    UnicodeString *tkey = new UnicodeString(s);
    808    if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) {
    809        // Delete to avoid memory leak
    810        delete tkey;
    811        tkey = nullptr;
    812        uprv_free(el);
    813        el = nullptr;
    814        delete setToAdopt;
    815        setToAdopt = nullptr;
    816 
    817        error(U_MEMORY_ALLOCATION_ERROR);
    818        return;
    819    }
    820    el->key = tkey;
    821    el->val = usetNode;
    822    uhash_put(fSetTable, el->key, el, fRB->fStatus);
    823 }
    824 
    825 
    826 
    827 //
    828 //  Assorted Unicode character constants.
    829 //     Numeric because there is no portable way to enter them as literals.
    830 //     (Think EBCDIC).
    831 //
    832 static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
    833 static const char16_t   chLF        = 0x0a;
    834 static const char16_t   chNEL       = 0x85;      //    NEL newline variant
    835 static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
    836 static const char16_t   chApos      = 0x27;      //  single quote, for quoted chars.
    837 static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
    838 static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
    839 static const char16_t   chLParen    = 0x28;
    840 static const char16_t   chRParen    = 0x29;
    841 
    842 
    843 //------------------------------------------------------------------------------
    844 //
    845 //  stripRules    Return a rules string without extra spaces.
    846 //                (Comments are removed separately, during rule parsing.)
    847 //
    848 //------------------------------------------------------------------------------
    849 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
    850    UnicodeString strippedRules;
    851    int32_t rulesLength = rules.length();
    852 
    853    for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
    854        UChar32 cp = rules.char32At(idx);
    855        bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
    856        if (whiteSpace) {
    857            continue;
    858        }
    859        strippedRules.append(cp);
    860    }
    861    return strippedRules;
    862 }
    863 
    864 
    865 //------------------------------------------------------------------------------
    866 //
    867 //  nextCharLL    Low Level Next Char from rule input source.
    868 //                Get a char from the input character iterator,
    869 //                keep track of input position for error reporting.
    870 //
    871 //------------------------------------------------------------------------------
    872 UChar32  RBBIRuleScanner::nextCharLL() {
    873    UChar32  ch;
    874 
    875    if (fNextIndex >= fRB->fRules.length()) {
    876        return static_cast<UChar32>(-1);
    877    }
    878    ch         = fRB->fRules.char32At(fNextIndex);
    879    if (U_IS_SURROGATE(ch)) {
    880        error(U_ILLEGAL_CHAR_FOUND);
    881        return U_SENTINEL;
    882    }
    883    fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
    884 
    885    if (ch == chCR ||
    886        ch == chNEL ||
    887        ch == chLS   ||
    888        (ch == chLF && fLastChar != chCR)) {
    889        // Character is starting a new line.  Bump up the line number, and
    890        //  reset the column to 0.
    891        fLineNum++;
    892        fCharNum=0;
    893        if (fQuoteMode) {
    894            error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
    895            fQuoteMode = false;
    896        }
    897    }
    898    else {
    899        // Character is not starting a new line.  Except in the case of a
    900        //   LF following a CR, increment the column position.
    901        if (ch != chLF) {
    902            fCharNum++;
    903        }
    904    }
    905    fLastChar = ch;
    906    return ch;
    907 }
    908 
    909 
    910 //------------------------------------------------------------------------------
    911 //
    912 //   nextChar     for rules scanning.  At this level, we handle stripping
    913 //                out comments and processing backslash character escapes.
    914 //                The rest of the rules grammar is handled at the next level up.
    915 //
    916 //------------------------------------------------------------------------------
    917 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
    918 
    919    // Unicode Character constants needed for the processing done by nextChar(),
    920    //   in hex because literals wont work on EBCDIC machines.
    921 
    922    fScanIndex = fNextIndex;
    923    c.fChar    = nextCharLL();
    924    c.fEscaped = false;
    925 
    926    //
    927    //  check for '' sequence.
    928    //  These are recognized in all contexts, whether in quoted text or not.
    929    //
    930    if (c.fChar == chApos) {
    931        if (fRB->fRules.char32At(fNextIndex) == chApos) {
    932            c.fChar    = nextCharLL();        // get nextChar officially so character counts
    933            c.fEscaped = true;                //   stay correct.
    934        }
    935        else
    936        {
    937            // Single quote, by itself.
    938            //   Toggle quoting mode.
    939            //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
    940            fQuoteMode = !fQuoteMode;
    941            if (fQuoteMode) {
    942                c.fChar = chLParen;
    943            } else {
    944                c.fChar = chRParen;
    945            }
    946            c.fEscaped = false;      // The paren that we return is not escaped.
    947            return;
    948        }
    949    }
    950 
    951    if (c.fChar == static_cast<UChar32>(-1)) {
    952        return;
    953    }
    954    if (fQuoteMode) {
    955        c.fEscaped = true;
    956    }
    957    else
    958    {
    959        // We are not in a 'quoted region' of the source.
    960        //
    961        if (c.fChar == chPound) {
    962            // Start of a comment.  Consume the rest of it.
    963            //  The new-line char that terminates the comment is always returned.
    964            //  It will be treated as white-space, and serves to break up anything
    965            //    that might otherwise incorrectly clump together with a comment in
    966            //    the middle (a variable name, for example.)
    967            int32_t commentStart = fScanIndex;
    968            for (;;) {
    969                c.fChar = nextCharLL();
    970                if (c.fChar == static_cast<UChar32>(-1) || // EOF
    971                    c.fChar == chCR     ||
    972                    c.fChar == chLF     ||
    973                    c.fChar == chNEL    ||
    974                    c.fChar == chLS)       {break;}
    975            }
    976            for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
    977                fRB->fStrippedRules.setCharAt(i, u' ');
    978            }
    979        }
    980        if (c.fChar == static_cast<UChar32>(-1)) {
    981            return;
    982        }
    983 
    984        //
    985        //  check for backslash escaped characters.
    986        //  Use UnicodeString::unescapeAt() to handle them.
    987        //
    988        if (c.fChar == chBackSlash) {
    989            c.fEscaped = true;
    990            int32_t startX = fNextIndex;
    991            c.fChar = fRB->fRules.unescapeAt(fNextIndex);
    992            if (fNextIndex == startX) {
    993                error(U_BRK_HEX_DIGITS_EXPECTED);
    994            }
    995            fCharNum += fNextIndex-startX;
    996        }
    997    }
    998    // putc(c.fChar, stdout);
    999 }
   1000 
   1001 //------------------------------------------------------------------------------
   1002 //
   1003 //  Parse RBBI rules.   The state machine for rules parsing is here.
   1004 //                      The state tables are hand-written in the file rbbirpt.txt,
   1005 //                      and converted to the form used here by a perl
   1006 //                      script rbbicst.pl
   1007 //
   1008 //------------------------------------------------------------------------------
   1009 void RBBIRuleScanner::parse() {
   1010    uint16_t                state;
   1011    const RBBIRuleTableEl  *tableEl;
   1012 
   1013    if (U_FAILURE(*fRB->fStatus)) {
   1014        return;
   1015    }
   1016 
   1017    state = 1;
   1018    nextChar(fC);
   1019    //
   1020    // Main loop for the rule parsing state machine.
   1021    //   Runs once per state transition.
   1022    //   Each time through optionally performs, depending on the state table,
   1023    //      - an advance to the next input char
   1024    //      - an action to be performed.
   1025    //      - pushing or popping a state to/from the local state return stack.
   1026    //
   1027    for (;;) {
   1028        //  Bail out if anything has gone wrong.
   1029        //  RBBI rule file parsing stops on the first error encountered.
   1030        if (U_FAILURE(*fRB->fStatus)) {
   1031            break;
   1032        }
   1033 
   1034        // Quit if state == 0.  This is the normal way to exit the state machine.
   1035        //
   1036        if (state == 0) {
   1037            break;
   1038        }
   1039 
   1040        // Find the state table element that matches the input char from the rule, or the
   1041        //    class of the input character.  Start with the first table row for this
   1042        //    state, then linearly scan forward until we find a row that matches the
   1043        //    character.  The last row for each state always matches all characters, so
   1044        //    the search will stop there, if not before.
   1045        //
   1046        tableEl = &gRuleParseStateTable[state];
   1047        #ifdef RBBI_DEBUG
   1048            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
   1049                RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
   1050                    fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
   1051            }
   1052        #endif
   1053 
   1054        for (;;) {
   1055            #ifdef RBBI_DEBUG
   1056                if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
   1057            #endif
   1058            if (tableEl->fCharClass < 127 && fC.fEscaped == false &&   tableEl->fCharClass == fC.fChar) {
   1059                // Table row specified an individual character, not a set, and
   1060                //   the input character is not escaped, and
   1061                //   the input character matched it.
   1062                break;
   1063            }
   1064            if (tableEl->fCharClass == 255) {
   1065                // Table row specified default, match anything character class.
   1066                break;
   1067            }
   1068            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
   1069                // Table row specified "escaped" and the char was escaped.
   1070                break;
   1071            }
   1072            if (tableEl->fCharClass == 253 && fC.fEscaped &&
   1073                (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
   1074                // Table row specified "escaped P" and the char is either 'p' or 'P'.
   1075                break;
   1076            }
   1077            if (tableEl->fCharClass == 252 && fC.fChar == static_cast<UChar32>(-1)) {
   1078                // Table row specified eof and we hit eof on the input.
   1079                break;
   1080            }
   1081 
   1082            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
   1083                fC.fEscaped == false &&                                      //   char is not escaped &&
   1084                fC.fChar != static_cast<UChar32>(-1)) {                      //   char is not EOF
   1085                U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
   1086                if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
   1087                    // Table row specified a character class, or set of characters,
   1088                    //   and the current char matches it.
   1089                    break;
   1090                }
   1091            }
   1092 
   1093            // No match on this row, advance to the next  row for this state,
   1094            tableEl++;
   1095        }
   1096        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
   1097 
   1098        //
   1099        // We've found the row of the state table that matches the current input
   1100        //   character from the rules string.
   1101        // Perform any action specified  by this row in the state table.
   1102        if (doParseActions(static_cast<int32_t>(tableEl->fAction)) == false) {
   1103            // Break out of the state machine loop if the
   1104            //   the action signalled some kind of error, or
   1105            //   the action was to exit, occurs on normal end-of-rules-input.
   1106            break;
   1107        }
   1108 
   1109        if (tableEl->fPushState != 0) {
   1110            fStackPtr++;
   1111            if (fStackPtr >= kStackSize) {
   1112                error(U_BRK_INTERNAL_ERROR);
   1113                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
   1114                fStackPtr--;
   1115            }
   1116            fStack[fStackPtr] = tableEl->fPushState;
   1117        }
   1118 
   1119        if (tableEl->fNextChar) {
   1120            nextChar(fC);
   1121        }
   1122 
   1123        // Get the next state from the table entry, or from the
   1124        //   state stack if the next state was specified as "pop".
   1125        if (tableEl->fNextState != 255) {
   1126            state = tableEl->fNextState;
   1127        } else {
   1128            state = fStack[fStackPtr];
   1129            fStackPtr--;
   1130            if (fStackPtr < 0) {
   1131                error(U_BRK_INTERNAL_ERROR);
   1132                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
   1133                fStackPtr++;
   1134            }
   1135        }
   1136 
   1137    }
   1138 
   1139    if (U_FAILURE(*fRB->fStatus)) {
   1140        return;
   1141    }
   1142    
   1143    // If there are no forward rules set an error.
   1144    //
   1145    if (fRB->fForwardTree == nullptr) {
   1146        error(U_BRK_RULE_SYNTAX);
   1147        return;
   1148    }
   1149 
   1150    //
   1151    // Parsing of the input RBBI rules is complete.
   1152    // We now have a parse tree for the rule expressions
   1153    // and a list of all UnicodeSets that are referenced.
   1154    //
   1155 #ifdef RBBI_DEBUG
   1156    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
   1157    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
   1158        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
   1159        RBBINode::printTree(fRB->fForwardTree, true);
   1160        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
   1161        RBBINode::printTree(fRB->fReverseTree, true);
   1162        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
   1163        RBBINode::printTree(fRB->fSafeFwdTree, true);
   1164        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
   1165        RBBINode::printTree(fRB->fSafeRevTree, true);
   1166    }
   1167 #endif
   1168 }
   1169 
   1170 
   1171 //------------------------------------------------------------------------------
   1172 //
   1173 //  printNodeStack     for debugging...
   1174 //
   1175 //------------------------------------------------------------------------------
   1176 #ifdef RBBI_DEBUG
   1177 void RBBIRuleScanner::printNodeStack(const char *title) {
   1178    int i;
   1179    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
   1180    for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], true);}
   1181 }
   1182 #endif
   1183 
   1184 
   1185 
   1186 
   1187 //------------------------------------------------------------------------------
   1188 //
   1189 //  pushNewNode   create a new RBBINode of the specified type and push it
   1190 //                onto the stack of nodes.
   1191 //
   1192 //------------------------------------------------------------------------------
   1193 RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
   1194    if (U_FAILURE(*fRB->fStatus)) {
   1195        return nullptr;
   1196    }
   1197    if (fNodeStackPtr >= kStackSize - 1) {
   1198        error(U_BRK_RULE_SYNTAX);
   1199        RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
   1200        return nullptr;
   1201    }
   1202    fNodeStackPtr++;
   1203    fNodeStack[fNodeStackPtr] = new RBBINode(t, *fRB->fStatus);
   1204    if (fNodeStack[fNodeStackPtr] == nullptr) {
   1205        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
   1206    }
   1207    return fNodeStack[fNodeStackPtr];
   1208 }
   1209 
   1210 
   1211 
   1212 //------------------------------------------------------------------------------
   1213 //
   1214 //  scanSet    Construct a UnicodeSet from the text at the current scan
   1215 //             position.  Advance the scan position to the first character
   1216 //             after the set.
   1217 //
   1218 //             A new RBBI setref node referring to the set is pushed onto the node
   1219 //             stack.
   1220 //
   1221 //             The scan position is normally under the control of the state machine
   1222 //             that controls rule parsing.  UnicodeSets, however, are parsed by
   1223 //             the UnicodeSet constructor, not by the RBBI rule parser.
   1224 //
   1225 //------------------------------------------------------------------------------
   1226 void RBBIRuleScanner::scanSet() {
   1227    ParsePosition  pos;
   1228    int            startPos;
   1229    int            i;
   1230 
   1231    if (U_FAILURE(*fRB->fStatus)) {
   1232        return;
   1233    }
   1234 
   1235    pos.setIndex(fScanIndex);
   1236    startPos = fScanIndex;
   1237    UErrorCode localStatus = U_ZERO_ERROR;
   1238    LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
   1239    if (U_FAILURE(localStatus)) {
   1240        error(localStatus);
   1241        return;
   1242    }
   1243    uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
   1244    if (U_FAILURE(localStatus)) {
   1245        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
   1246        //         UnicodeSet appears to not be reporting correctly at this time.
   1247        #ifdef RBBI_DEBUG
   1248            RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
   1249        #endif
   1250        error(localStatus);
   1251        return;
   1252    }
   1253 
   1254    // Verify that the set contains at least one code point.
   1255    //
   1256    U_ASSERT(uset.isValid());
   1257    UnicodeSet tempSet(*uset);
   1258    // Use tempSet to handle the case that the UnicodeSet contains
   1259    // only string element, such as [{ab}] and treat it as empty set.
   1260    tempSet.removeAllStrings();
   1261    if (tempSet.isEmpty()) {
   1262        // This set is empty.
   1263        //  Make it an error, because it almost certainly is not what the user wanted.
   1264        //  Also, avoids having to think about corner cases in the tree manipulation code
   1265        //   that occurs later on.
   1266        error(U_BRK_RULE_EMPTY_SET);
   1267        return;
   1268    }
   1269 
   1270 
   1271    // Advance the RBBI parse position over the UnicodeSet pattern.
   1272    //   Don't just set fScanIndex because the line/char positions maintained
   1273    //   for error reporting would be thrown off.
   1274    i = pos.getIndex();
   1275    for (;U_SUCCESS(*fRB->fStatus);) {
   1276        if (fNextIndex >= i) {
   1277            break;
   1278        }
   1279        nextCharLL();
   1280    }
   1281 
   1282    if (U_SUCCESS(*fRB->fStatus)) {
   1283        RBBINode         *n;
   1284 
   1285        n = pushNewNode(RBBINode::setRef);
   1286        if (U_FAILURE(*fRB->fStatus)) {
   1287            return;
   1288        }
   1289        n->fFirstPos = startPos;
   1290        n->fLastPos  = fNextIndex;
   1291        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
   1292        //  findSetFor() serves several purposes here:
   1293        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
   1294        //     - Maintains collection of all sets in use, needed later for establishing
   1295        //          character categories for run time engine.
   1296        //     - Eliminates mulitiple instances of the same set.
   1297        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
   1298        findSetFor(n->fText, n, uset.orphan());
   1299    }
   1300 
   1301 }
   1302 
   1303 int32_t RBBIRuleScanner::numRules() {
   1304    return fRuleNum;
   1305 }
   1306 
   1307 U_NAMESPACE_END
   1308 
   1309 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE