rbbitblb.cpp (65404B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 // 10 // rbbitblb.cpp 11 // 12 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/unistr.h" 19 #include "rbbitblb.h" 20 #include "rbbirb.h" 21 #include "rbbiscan.h" 22 #include "rbbisetb.h" 23 #include "rbbidata.h" 24 #include "cstring.h" 25 #include "uassert.h" 26 #include "uvectr32.h" 27 #include "cmemory.h" 28 29 U_NAMESPACE_BEGIN 30 31 const int32_t kMaxStateFor8BitsTable = 255; 32 33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) : 34 fRB(rb), 35 fTree(*rootNode), 36 fStatus(&status), 37 fDStates(nullptr), 38 fSafeTable(nullptr) { 39 if (U_FAILURE(status)) { 40 return; 41 } 42 // fDStates is UVector<RBBIStateDescriptor *> 43 fDStates = new UVector(status); 44 if (U_SUCCESS(status) && fDStates == nullptr ) { 45 status = U_MEMORY_ALLOCATION_ERROR; 46 } 47 } 48 49 50 51 RBBITableBuilder::~RBBITableBuilder() { 52 int i; 53 for (i=0; i<fDStates->size(); i++) { 54 delete static_cast<RBBIStateDescriptor*>(fDStates->elementAt(i)); 55 } 56 delete fDStates; 57 delete fSafeTable; 58 delete fLookAheadRuleMap; 59 } 60 61 62 //----------------------------------------------------------------------------- 63 // 64 // RBBITableBuilder::buildForwardTable - This is the main function for building 65 // the DFA state transition table from the RBBI rules parse tree. 66 // 67 //----------------------------------------------------------------------------- 68 void RBBITableBuilder::buildForwardTable() { 69 70 if (U_FAILURE(*fStatus)) { 71 return; 72 } 73 74 // If there were no rules, just return. This situation can easily arise 75 // for the reverse rules. 76 if (fTree==nullptr) { 77 return; 78 } 79 80 // 81 // Walk through the tree, replacing any references to $variables with a copy of the 82 // parse tree for the substitution expression. 83 // 84 fTree = fTree->flattenVariables(*fStatus, 0); 85 if (U_FAILURE(*fStatus)) { 86 return; 87 } 88 #ifdef RBBI_DEBUG 89 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { 90 RBBIDebugPuts("\nParse tree after flattening variable references."); 91 RBBINode::printTree(fTree, true); 92 } 93 #endif 94 95 // 96 // If the rules contained any references to {bof} 97 // add a {bof} <cat> <former root of tree> to the 98 // tree. Means that all matches must start out with the 99 // {bof} fake character. 100 // 101 if (fRB->fSetBuilder->sawBOF()) { 102 RBBINode *bofTop = new RBBINode(RBBINode::opCat, *fStatus); 103 if (bofTop == nullptr) { 104 *fStatus = U_MEMORY_ALLOCATION_ERROR; 105 } 106 if (U_FAILURE(*fStatus)) { 107 delete bofTop; 108 return; 109 } 110 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar, *fStatus); 111 // Delete and exit if memory allocation failed. 112 if (bofLeaf == nullptr) { 113 *fStatus = U_MEMORY_ALLOCATION_ERROR; 114 } 115 if (U_FAILURE(*fStatus)) { 116 delete bofLeaf; 117 delete bofTop; 118 return; 119 } 120 bofTop->fLeftChild = bofLeaf; 121 bofTop->fRightChild = fTree; 122 bofLeaf->fParent = bofTop; 123 bofLeaf->fVal = 2; // Reserved value for {bof}. 124 fTree = bofTop; 125 } 126 127 // 128 // Add a unique right-end marker to the expression. 129 // Appears as a cat-node, left child being the original tree, 130 // right child being the end marker. 131 // 132 RBBINode *cn = new RBBINode(RBBINode::opCat, *fStatus); 133 // Exit if memory allocation failed. 134 if (cn == nullptr) { 135 *fStatus = U_MEMORY_ALLOCATION_ERROR; 136 } 137 if (U_FAILURE(*fStatus)) { 138 delete cn; 139 return; 140 } 141 cn->fLeftChild = fTree; 142 fTree->fParent = cn; 143 RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark, *fStatus); 144 // Delete and exit if memory allocation failed. 145 if (cn->fRightChild == nullptr) { 146 *fStatus = U_MEMORY_ALLOCATION_ERROR; 147 } 148 if (U_FAILURE(*fStatus)) { 149 delete cn; 150 return; 151 } 152 cn->fRightChild->fParent = cn; 153 fTree = cn; 154 155 // 156 // Replace all references to UnicodeSets with the tree for the equivalent 157 // expression. 158 // 159 fTree->flattenSets(*fStatus, 0); 160 #ifdef RBBI_DEBUG 161 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { 162 RBBIDebugPuts("\nParse tree after flattening Unicode Set references."); 163 RBBINode::printTree(fTree, true); 164 } 165 #endif 166 167 168 // 169 // calculate the functions nullable, firstpos, lastpos and followpos on 170 // nodes in the parse tree. 171 // See the algorithm description in Aho. 172 // Understanding how this works by looking at the code alone will be 173 // nearly impossible. 174 // 175 calcNullable(fTree); 176 calcFirstPos(fTree); 177 calcLastPos(fTree); 178 calcFollowPos(fTree); 179 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { 180 RBBIDebugPuts("\n"); 181 printPosSets(fTree); 182 } 183 184 // 185 // For "chained" rules, modify the followPos sets 186 // 187 if (fRB->fChainRules) { 188 calcChainedFollowPos(fTree, endMarkerNode); 189 } 190 191 // 192 // BOF (start of input) test fixup. 193 // 194 if (fRB->fSetBuilder->sawBOF()) { 195 bofFixup(); 196 } 197 198 // 199 // Build the DFA state transition tables. 200 // 201 buildStateTable(); 202 mapLookAheadRules(); 203 flagAcceptingStates(); 204 flagLookAheadStates(); 205 flagTaggedStates(); 206 207 // 208 // Update the global table of rule status {tag} values 209 // The rule builder has a global vector of status values that are common 210 // for all tables. Merge the ones from this table into the global set. 211 // 212 mergeRuleStatusVals(); 213 } 214 215 216 217 //----------------------------------------------------------------------------- 218 // 219 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 220 // 221 //----------------------------------------------------------------------------- 222 void RBBITableBuilder::calcNullable(RBBINode *n) { 223 if (n == nullptr) { 224 return; 225 } 226 if (n->fType == RBBINode::setRef || 227 n->fType == RBBINode::endMark ) { 228 // These are non-empty leaf node types. 229 n->fNullable = false; 230 return; 231 } 232 233 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { 234 // Lookahead marker node. It's a leaf, so no recursion on children. 235 // It's nullable because it does not match any literal text from the input stream. 236 n->fNullable = true; 237 return; 238 } 239 240 241 // The node is not a leaf. 242 // Calculate nullable on its children. 243 calcNullable(n->fLeftChild); 244 calcNullable(n->fRightChild); 245 246 // Apply functions from table 3.40 in Aho 247 if (n->fType == RBBINode::opOr) { 248 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; 249 } 250 else if (n->fType == RBBINode::opCat) { 251 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; 252 } 253 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { 254 n->fNullable = true; 255 } 256 else { 257 n->fNullable = false; 258 } 259 } 260 261 262 263 264 //----------------------------------------------------------------------------- 265 // 266 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 267 // 268 //----------------------------------------------------------------------------- 269 void RBBITableBuilder::calcFirstPos(RBBINode *n) { 270 if (n == nullptr) { 271 return; 272 } 273 if (n->fType == RBBINode::leafChar || 274 n->fType == RBBINode::endMark || 275 n->fType == RBBINode::lookAhead || 276 n->fType == RBBINode::tag) { 277 // These are non-empty leaf node types. 278 // Note: In order to maintain the sort invariant on the set, 279 // this function should only be called on a node whose set is 280 // empty to start with. 281 n->fFirstPosSet->addElement(n, *fStatus); 282 return; 283 } 284 285 // The node is not a leaf. 286 // Calculate firstPos on its children. 287 calcFirstPos(n->fLeftChild); 288 calcFirstPos(n->fRightChild); 289 290 // Apply functions from table 3.40 in Aho 291 if (n->fType == RBBINode::opOr) { 292 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 293 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 294 } 295 else if (n->fType == RBBINode::opCat) { 296 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 297 if (n->fLeftChild->fNullable) { 298 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 299 } 300 } 301 else if (n->fType == RBBINode::opStar || 302 n->fType == RBBINode::opQuestion || 303 n->fType == RBBINode::opPlus) { 304 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 305 } 306 } 307 308 309 310 //----------------------------------------------------------------------------- 311 // 312 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 313 // 314 //----------------------------------------------------------------------------- 315 void RBBITableBuilder::calcLastPos(RBBINode *n) { 316 if (n == nullptr) { 317 return; 318 } 319 if (n->fType == RBBINode::leafChar || 320 n->fType == RBBINode::endMark || 321 n->fType == RBBINode::lookAhead || 322 n->fType == RBBINode::tag) { 323 // These are non-empty leaf node types. 324 // Note: In order to maintain the sort invariant on the set, 325 // this function should only be called on a node whose set is 326 // empty to start with. 327 n->fLastPosSet->addElement(n, *fStatus); 328 return; 329 } 330 331 // The node is not a leaf. 332 // Calculate lastPos on its children. 333 calcLastPos(n->fLeftChild); 334 calcLastPos(n->fRightChild); 335 336 // Apply functions from table 3.40 in Aho 337 if (n->fType == RBBINode::opOr) { 338 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 339 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 340 } 341 else if (n->fType == RBBINode::opCat) { 342 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 343 if (n->fRightChild->fNullable) { 344 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 345 } 346 } 347 else if (n->fType == RBBINode::opStar || 348 n->fType == RBBINode::opQuestion || 349 n->fType == RBBINode::opPlus) { 350 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 351 } 352 } 353 354 355 356 //----------------------------------------------------------------------------- 357 // 358 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 359 // 360 //----------------------------------------------------------------------------- 361 void RBBITableBuilder::calcFollowPos(RBBINode *n) { 362 if (n == nullptr || 363 n->fType == RBBINode::leafChar || 364 n->fType == RBBINode::endMark) { 365 return; 366 } 367 368 calcFollowPos(n->fLeftChild); 369 calcFollowPos(n->fRightChild); 370 371 // Aho rule #1 372 if (n->fType == RBBINode::opCat) { 373 RBBINode *i; // is 'i' in Aho's description 374 uint32_t ix; 375 376 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; 377 378 for (ix = 0; ix < static_cast<uint32_t>(LastPosOfLeftChild->size()); ix++) { 379 i = static_cast<RBBINode*>(LastPosOfLeftChild->elementAt(ix)); 380 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); 381 } 382 } 383 384 // Aho rule #2 385 if (n->fType == RBBINode::opStar || 386 n->fType == RBBINode::opPlus) { 387 RBBINode *i; // again, n and i are the names from Aho's description. 388 uint32_t ix; 389 390 for (ix = 0; ix < static_cast<uint32_t>(n->fLastPosSet->size()); ix++) { 391 i = static_cast<RBBINode*>(n->fLastPosSet->elementAt(ix)); 392 setAdd(i->fFollowPos, n->fFirstPosSet); 393 } 394 } 395 396 397 398 } 399 400 //----------------------------------------------------------------------------- 401 // 402 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged 403 // as roots of a rule to a destination vector. 404 // 405 //----------------------------------------------------------------------------- 406 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) { 407 if (node == nullptr || U_FAILURE(*fStatus)) { 408 return; 409 } 410 U_ASSERT(!dest->hasDeleter()); 411 if (node->fRuleRoot) { 412 dest->addElement(node, *fStatus); 413 // Note: rules cannot nest. If we found a rule start node, 414 // no child node can also be a start node. 415 return; 416 } 417 addRuleRootNodes(dest, node->fLeftChild); 418 addRuleRootNodes(dest, node->fRightChild); 419 } 420 421 //----------------------------------------------------------------------------- 422 // 423 // calcChainedFollowPos. Modify the previously calculated followPos sets 424 // to implement rule chaining. NOT described by Aho 425 // 426 //----------------------------------------------------------------------------- 427 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) { 428 429 UVector leafNodes(*fStatus); 430 if (U_FAILURE(*fStatus)) { 431 return; 432 } 433 434 // get a list all leaf nodes 435 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); 436 if (U_FAILURE(*fStatus)) { 437 return; 438 } 439 440 // Collect all leaf nodes that can start matches for rules 441 // with inbound chaining enabled, which is the union of the 442 // firstPosition sets from each of the rule root nodes. 443 444 UVector ruleRootNodes(*fStatus); 445 addRuleRootNodes(&ruleRootNodes, tree); 446 447 UVector matchStartNodes(*fStatus); 448 for (int j=0; j<ruleRootNodes.size(); ++j) { 449 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j)); 450 if (node->fChainIn) { 451 setAdd(&matchStartNodes, node->fFirstPosSet); 452 } 453 } 454 if (U_FAILURE(*fStatus)) { 455 return; 456 } 457 458 int32_t endNodeIx; 459 int32_t startNodeIx; 460 461 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { 462 RBBINode* endNode = static_cast<RBBINode*>(leafNodes.elementAt(endNodeIx)); 463 464 // Identify leaf nodes that correspond to overall rule match positions. 465 // These include the endMarkNode in their followPos sets. 466 // 467 // Note: do not consider other end marker nodes, those that are added to 468 // look-ahead rules. These can't chain; a match immediately stops 469 // further matching. This leaves exactly one end marker node, the one 470 // at the end of the complete tree. 471 472 if (!endNode->fFollowPos->contains(endMarkNode)) { 473 continue; 474 } 475 476 // We've got a node that can end a match. 477 478 // Now iterate over the nodes that can start a match, looking for ones 479 // with the same char class as our ending node. 480 RBBINode *startNode; 481 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) { 482 startNode = static_cast<RBBINode*>(matchStartNodes.elementAt(startNodeIx)); 483 if (startNode->fType != RBBINode::leafChar) { 484 continue; 485 } 486 487 if (endNode->fVal == startNode->fVal) { 488 // The end val (character class) of one possible match is the 489 // same as the start of another. 490 491 // Add all nodes from the followPos of the start node to the 492 // followPos set of the end node, which will have the effect of 493 // letting matches transition from a match state at endNode 494 // to the second char of a match starting with startNode. 495 setAdd(endNode->fFollowPos, startNode->fFollowPos); 496 } 497 } 498 } 499 } 500 501 502 //----------------------------------------------------------------------------- 503 // 504 // bofFixup. Fixup for state tables that include {bof} beginning of input testing. 505 // Do an swizzle similar to chaining, modifying the followPos set of 506 // the bofNode to include the followPos nodes from other {bot} nodes 507 // scattered through the tree. 508 // 509 // This function has much in common with calcChainedFollowPos(). 510 // 511 //----------------------------------------------------------------------------- 512 void RBBITableBuilder::bofFixup() { 513 514 if (U_FAILURE(*fStatus)) { 515 return; 516 } 517 518 // The parse tree looks like this ... 519 // fTree root ---> <cat> 520 // / \ . 521 // <cat> <#end node> 522 // / \ . 523 // <bofNode> rest 524 // of tree 525 // 526 // We will be adding things to the followPos set of the <bofNode> 527 // 528 RBBINode *bofNode = fTree->fLeftChild->fLeftChild; 529 U_ASSERT(bofNode->fType == RBBINode::leafChar); 530 U_ASSERT(bofNode->fVal == 2); 531 532 // Get all nodes that can be the start a match of the user-written rules 533 // (excluding the fake bofNode) 534 // We want the nodes that can start a match in the 535 // part labeled "rest of tree" 536 // 537 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; 538 539 RBBINode *startNode; 540 int startNodeIx; 541 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 542 startNode = static_cast<RBBINode*>(matchStartNodes->elementAt(startNodeIx)); 543 if (startNode->fType != RBBINode::leafChar) { 544 continue; 545 } 546 547 if (startNode->fVal == bofNode->fVal) { 548 // We found a leaf node corresponding to a {bof} that was 549 // explicitly written into a rule. 550 // Add everything from the followPos set of this node to the 551 // followPos set of the fake bofNode at the start of the tree. 552 // 553 setAdd(bofNode->fFollowPos, startNode->fFollowPos); 554 } 555 } 556 } 557 558 //----------------------------------------------------------------------------- 559 // 560 // buildStateTable() Determine the set of runtime DFA states and the 561 // transition tables for these states, by the algorithm 562 // of fig. 3.44 in Aho. 563 // 564 // Most of the comments are quotes of Aho's psuedo-code. 565 // 566 //----------------------------------------------------------------------------- 567 void RBBITableBuilder::buildStateTable() { 568 if (U_FAILURE(*fStatus)) { 569 return; 570 } 571 RBBIStateDescriptor *failState; 572 // Set it to nullptr to avoid uninitialized warning 573 RBBIStateDescriptor *initialState = nullptr; 574 // 575 // Add a dummy state 0 - the stop state. Not from Aho. 576 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; 577 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 578 if (failState == nullptr) { 579 *fStatus = U_MEMORY_ALLOCATION_ERROR; 580 goto ExitBuildSTdeleteall; 581 } 582 failState->fPositions = new UVector(*fStatus); 583 if (failState->fPositions == nullptr) { 584 *fStatus = U_MEMORY_ALLOCATION_ERROR; 585 } 586 if (failState->fPositions == nullptr || U_FAILURE(*fStatus)) { 587 goto ExitBuildSTdeleteall; 588 } 589 fDStates->addElement(failState, *fStatus); 590 if (U_FAILURE(*fStatus)) { 591 goto ExitBuildSTdeleteall; 592 } 593 594 // initially, the only unmarked state in Dstates is firstpos(root), 595 // where toot is the root of the syntax tree for (r)#; 596 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 597 if (initialState == nullptr) { 598 *fStatus = U_MEMORY_ALLOCATION_ERROR; 599 } 600 if (U_FAILURE(*fStatus)) { 601 goto ExitBuildSTdeleteall; 602 } 603 initialState->fPositions = new UVector(*fStatus); 604 if (initialState->fPositions == nullptr) { 605 *fStatus = U_MEMORY_ALLOCATION_ERROR; 606 } 607 if (U_FAILURE(*fStatus)) { 608 goto ExitBuildSTdeleteall; 609 } 610 setAdd(initialState->fPositions, fTree->fFirstPosSet); 611 fDStates->addElement(initialState, *fStatus); 612 if (U_FAILURE(*fStatus)) { 613 goto ExitBuildSTdeleteall; 614 } 615 616 // while there is an unmarked state T in Dstates do begin 617 for (;;) { 618 RBBIStateDescriptor *T = nullptr; 619 int32_t tx; 620 for (tx=1; tx<fDStates->size(); tx++) { 621 RBBIStateDescriptor *temp; 622 temp = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(tx)); 623 if (temp->fMarked == false) { 624 T = temp; 625 break; 626 } 627 } 628 if (T == nullptr) { 629 break; 630 } 631 632 // mark T; 633 T->fMarked = true; 634 635 // for each input symbol a do begin 636 int32_t a; 637 for (a = 1; a<=lastInputSymbol; a++) { 638 // let U be the set of positions that are in followpos(p) 639 // for some position p in T 640 // such that the symbol at position p is a; 641 UVector *U = nullptr; 642 RBBINode *p; 643 int32_t px; 644 for (px=0; px<T->fPositions->size(); px++) { 645 p = static_cast<RBBINode*>(T->fPositions->elementAt(px)); 646 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { 647 if (U == nullptr) { 648 U = new UVector(*fStatus); 649 if (U == nullptr) { 650 *fStatus = U_MEMORY_ALLOCATION_ERROR; 651 goto ExitBuildSTdeleteall; 652 } 653 } 654 setAdd(U, p->fFollowPos); 655 } 656 } 657 658 // if U is not empty and not in DStates then 659 int32_t ux = 0; 660 UBool UinDstates = false; 661 if (U != nullptr) { 662 U_ASSERT(U->size() > 0); 663 int ix; 664 for (ix=0; ix<fDStates->size(); ix++) { 665 RBBIStateDescriptor *temp2; 666 temp2 = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(ix)); 667 if (setEquals(U, temp2->fPositions)) { 668 delete U; 669 U = temp2->fPositions; 670 ux = ix; 671 UinDstates = true; 672 break; 673 } 674 } 675 676 // Add U as an unmarked state to Dstates 677 if (!UinDstates) 678 { 679 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 680 if (newState == nullptr) { 681 *fStatus = U_MEMORY_ALLOCATION_ERROR; 682 } 683 if (U_FAILURE(*fStatus)) { 684 goto ExitBuildSTdeleteall; 685 } 686 newState->fPositions = U; 687 fDStates->addElement(newState, *fStatus); 688 if (U_FAILURE(*fStatus)) { 689 return; 690 } 691 ux = fDStates->size()-1; 692 } 693 694 // Dtran[T, a] := U; 695 T->fDtran->setElementAt(ux, a); 696 } 697 } 698 } 699 return; 700 // delete local pointers only if error occurred. 701 ExitBuildSTdeleteall: 702 delete initialState; 703 delete failState; 704 } 705 706 707 /** 708 * mapLookAheadRules 709 * 710 */ 711 void RBBITableBuilder::mapLookAheadRules() { 712 fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus); 713 if (fLookAheadRuleMap == nullptr) { 714 *fStatus = U_MEMORY_ALLOCATION_ERROR; 715 } 716 if (U_FAILURE(*fStatus)) { 717 return; 718 } 719 fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1); 720 721 for (int32_t n=0; n<fDStates->size(); n++) { 722 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n)); 723 int32_t laSlotForState = 0; 724 725 // Establish the look-ahead slot for this state, if the state covers 726 // any look-ahead nodes - corresponding to the '/' in look-ahead rules. 727 728 // If any of the look-ahead nodes already have a slot assigned, use it, 729 // otherwise assign a new one. 730 731 bool sawLookAheadNode = false; 732 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) { 733 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos)); 734 if (node->fType != RBBINode::NodeType::lookAhead) { 735 continue; 736 } 737 sawLookAheadNode = true; 738 int32_t ruleNum = node->fVal; // Set when rule was originally parsed. 739 U_ASSERT(ruleNum < fLookAheadRuleMap->size()); 740 U_ASSERT(ruleNum > 0); 741 int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum); 742 if (laSlot != 0) { 743 if (laSlotForState == 0) { 744 laSlotForState = laSlot; 745 } else { 746 // TODO: figure out if this can fail, change to setting an error code if so. 747 U_ASSERT(laSlot == laSlotForState); 748 } 749 } 750 } 751 if (!sawLookAheadNode) { 752 continue; 753 } 754 755 if (laSlotForState == 0) { 756 laSlotForState = ++fLASlotsInUse; 757 } 758 759 // For each look ahead node covered by this state, 760 // set the mapping from the node's rule number to the look ahead slot. 761 // There can be multiple nodes/rule numbers going to the same la slot. 762 763 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) { 764 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos)); 765 if (node->fType != RBBINode::NodeType::lookAhead) { 766 continue; 767 } 768 int32_t ruleNum = node->fVal; // Set when rule was originally parsed. 769 int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum); 770 (void)existingVal; 771 U_ASSERT(existingVal == 0 || existingVal == laSlotForState); 772 fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum); 773 } 774 } 775 776 } 777 778 //----------------------------------------------------------------------------- 779 // 780 // flagAcceptingStates Identify accepting states. 781 // First get a list of all of the end marker nodes. 782 // Then, for each state s, 783 // if s contains one of the end marker nodes in its list of tree positions then 784 // s is an accepting state. 785 // 786 //----------------------------------------------------------------------------- 787 void RBBITableBuilder::flagAcceptingStates() { 788 if (U_FAILURE(*fStatus)) { 789 return; 790 } 791 UVector endMarkerNodes(*fStatus); 792 RBBINode *endMarker; 793 int32_t i; 794 int32_t n; 795 796 if (U_FAILURE(*fStatus)) { 797 return; 798 } 799 800 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 801 if (U_FAILURE(*fStatus)) { 802 return; 803 } 804 805 for (i=0; i<endMarkerNodes.size(); i++) { 806 endMarker = static_cast<RBBINode*>(endMarkerNodes.elementAt(i)); 807 for (n=0; n<fDStates->size(); n++) { 808 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n)); 809 if (sd->fPositions->indexOf(endMarker) >= 0) { 810 // Any non-zero value for fAccepting means this is an accepting node. 811 // The value is what will be returned to the user as the break status. 812 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1). 813 814 if (sd->fAccepting==0) { 815 // State hasn't been marked as accepting yet. Do it now. 816 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal); 817 if (sd->fAccepting == 0) { 818 sd->fAccepting = ACCEPTING_UNCONDITIONAL; 819 } 820 } 821 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) { 822 // Both lookahead and non-lookahead accepting for this state. 823 // Favor the look-ahead, because a look-ahead match needs to 824 // immediately stop the run-time engine. First match, not longest. 825 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal); 826 } 827 // implicit else: 828 // if sd->fAccepting already had a value other than 0 or 1, leave it be. 829 } 830 } 831 } 832 } 833 834 835 //----------------------------------------------------------------------------- 836 // 837 // flagLookAheadStates Very similar to flagAcceptingStates, above. 838 // 839 //----------------------------------------------------------------------------- 840 void RBBITableBuilder::flagLookAheadStates() { 841 if (U_FAILURE(*fStatus)) { 842 return; 843 } 844 UVector lookAheadNodes(*fStatus); 845 RBBINode *lookAheadNode; 846 int32_t i; 847 int32_t n; 848 849 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); 850 if (U_FAILURE(*fStatus)) { 851 return; 852 } 853 for (i=0; i<lookAheadNodes.size(); i++) { 854 lookAheadNode = static_cast<RBBINode*>(lookAheadNodes.elementAt(i)); 855 U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead); 856 857 for (n=0; n<fDStates->size(); n++) { 858 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n)); 859 int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode); 860 if (positionsIdx >= 0) { 861 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx)); 862 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal); 863 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot); 864 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) { 865 // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n", 866 // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot); 867 // } 868 sd->fLookAhead = lookaheadSlot; 869 } 870 } 871 } 872 } 873 874 875 876 877 //----------------------------------------------------------------------------- 878 // 879 // flagTaggedStates 880 // 881 //----------------------------------------------------------------------------- 882 void RBBITableBuilder::flagTaggedStates() { 883 if (U_FAILURE(*fStatus)) { 884 return; 885 } 886 UVector tagNodes(*fStatus); 887 RBBINode *tagNode; 888 int32_t i; 889 int32_t n; 890 891 if (U_FAILURE(*fStatus)) { 892 return; 893 } 894 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); 895 if (U_FAILURE(*fStatus)) { 896 return; 897 } 898 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) 899 tagNode = static_cast<RBBINode*>(tagNodes.elementAt(i)); 900 901 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table) 902 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n)); 903 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t 904 sortedAdd(&sd->fTagVals, tagNode->fVal); 905 } 906 } 907 } 908 } 909 910 911 912 913 //----------------------------------------------------------------------------- 914 // 915 // mergeRuleStatusVals 916 // 917 // Update the global table of rule status {tag} values 918 // The rule builder has a global vector of status values that are common 919 // for all tables. Merge the ones from this table into the global set. 920 // 921 //----------------------------------------------------------------------------- 922 void RBBITableBuilder::mergeRuleStatusVals() { 923 // 924 // The basic outline of what happens here is this... 925 // 926 // for each state in this state table 927 // if the status tag list for this state is in the global statuses list 928 // record where and 929 // continue with the next state 930 // else 931 // add the tag list for this state to the global list. 932 // 933 int i; 934 int n; 935 936 // Pre-set a single tag of {0} into the table. 937 // We will need this as a default, for rule sets with no explicit tagging. 938 if (fRB->fRuleStatusVals->size() == 0) { 939 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group 940 fRB->fRuleStatusVals->addElement(static_cast<int32_t>(0), *fStatus); // and our single status of zero 941 } 942 943 // For each state 944 for (n=0; n<fDStates->size(); n++) { 945 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n)); 946 UVector *thisStatesTagValues = sd->fTagVals; 947 if (thisStatesTagValues == nullptr) { 948 // No tag values are explicitly associated with this state. 949 // Set the default tag value. 950 sd->fTagsIdx = 0; 951 continue; 952 } 953 954 // There are tag(s) associated with this state. 955 // fTagsIdx will be the index into the global tag list for this state's tag values. 956 // Initial value of -1 flags that we haven't got it set yet. 957 sd->fTagsIdx = -1; 958 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list 959 int32_t nextTagGroupStart = 0; 960 961 // Loop runs once per group of tags in the global list 962 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { 963 thisTagGroupStart = nextTagGroupStart; 964 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; 965 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { 966 // The number of tags for this state is different from 967 // the number of tags in this group from the global list. 968 // Continue with the next group from the global list. 969 continue; 970 } 971 // The lengths match, go ahead and compare the actual tag values 972 // between this state and the group from the global list. 973 for (i=0; i<thisStatesTagValues->size(); i++) { 974 if (thisStatesTagValues->elementAti(i) != 975 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { 976 // Mismatch. 977 break; 978 } 979 } 980 981 if (i == thisStatesTagValues->size()) { 982 // We found a set of tag values in the global list that match 983 // those for this state. Use them. 984 sd->fTagsIdx = thisTagGroupStart; 985 break; 986 } 987 } 988 989 if (sd->fTagsIdx == -1) { 990 // No suitable entry in the global tag list already. Add one 991 sd->fTagsIdx = fRB->fRuleStatusVals->size(); 992 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); 993 for (i=0; i<thisStatesTagValues->size(); i++) { 994 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); 995 } 996 } 997 } 998 } 999 1000 1001 1002 1003 1004 1005 1006 //----------------------------------------------------------------------------- 1007 // 1008 // sortedAdd Add a value to a vector of sorted values (ints). 1009 // Do not replicate entries; if the value is already there, do not 1010 // add a second one. 1011 // Lazily create the vector if it does not already exist. 1012 // 1013 //----------------------------------------------------------------------------- 1014 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { 1015 int32_t i; 1016 1017 if (*vector == nullptr) { 1018 *vector = new UVector(*fStatus); 1019 } 1020 if (*vector == nullptr || U_FAILURE(*fStatus)) { 1021 return; 1022 } 1023 UVector *vec = *vector; 1024 int32_t vSize = vec->size(); 1025 for (i=0; i<vSize; i++) { 1026 int32_t valAtI = vec->elementAti(i); 1027 if (valAtI == val) { 1028 // The value is already in the vector. Don't add it again. 1029 return; 1030 } 1031 if (valAtI > val) { 1032 break; 1033 } 1034 } 1035 vec->insertElementAt(val, i, *fStatus); 1036 } 1037 1038 1039 1040 //----------------------------------------------------------------------------- 1041 // 1042 // setAdd Set operation on UVector 1043 // dest = dest union source 1044 // Elements may only appear once and must be sorted. 1045 // 1046 //----------------------------------------------------------------------------- 1047 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { 1048 U_ASSERT(!dest->hasDeleter()); 1049 U_ASSERT(!source->hasDeleter()); 1050 int32_t destOriginalSize = dest->size(); 1051 int32_t sourceSize = source->size(); 1052 int32_t di = 0; 1053 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc 1054 void **destPtr, **sourcePtr; 1055 void **destLim, **sourceLim; 1056 1057 if (destOriginalSize > destArray.getCapacity()) { 1058 if (destArray.resize(destOriginalSize) == nullptr) { 1059 return; 1060 } 1061 } 1062 destPtr = destArray.getAlias(); 1063 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? 1064 1065 if (sourceSize > sourceArray.getCapacity()) { 1066 if (sourceArray.resize(sourceSize) == nullptr) { 1067 return; 1068 } 1069 } 1070 sourcePtr = sourceArray.getAlias(); 1071 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? 1072 1073 // Avoid multiple "get element" calls by getting the contents into arrays 1074 (void) dest->toArray(destPtr); 1075 (void) source->toArray(sourcePtr); 1076 1077 dest->setSize(sourceSize+destOriginalSize, *fStatus); 1078 if (U_FAILURE(*fStatus)) { 1079 return; 1080 } 1081 1082 while (sourcePtr < sourceLim && destPtr < destLim) { 1083 if (*destPtr == *sourcePtr) { 1084 dest->setElementAt(*sourcePtr++, di++); 1085 destPtr++; 1086 } 1087 // This check is required for machines with segmented memory, like i5/OS. 1088 // Direct pointer comparison is not recommended. 1089 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { 1090 dest->setElementAt(*destPtr++, di++); 1091 } 1092 else { /* *sourcePtr < *destPtr */ 1093 dest->setElementAt(*sourcePtr++, di++); 1094 } 1095 } 1096 1097 // At most one of these two cleanup loops will execute 1098 while (destPtr < destLim) { 1099 dest->setElementAt(*destPtr++, di++); 1100 } 1101 while (sourcePtr < sourceLim) { 1102 dest->setElementAt(*sourcePtr++, di++); 1103 } 1104 1105 dest->setSize(di, *fStatus); 1106 } 1107 1108 1109 1110 //----------------------------------------------------------------------------- 1111 // 1112 // setEqual Set operation on UVector. 1113 // Compare for equality. 1114 // Elements must be sorted. 1115 // 1116 //----------------------------------------------------------------------------- 1117 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { 1118 return a->equals(*b); 1119 } 1120 1121 1122 //----------------------------------------------------------------------------- 1123 // 1124 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos 1125 // for each node in the tree. 1126 // 1127 //----------------------------------------------------------------------------- 1128 #ifdef RBBI_DEBUG 1129 void RBBITableBuilder::printPosSets(RBBINode *n) { 1130 if (n==nullptr) { 1131 return; 1132 } 1133 printf("\n"); 1134 RBBINode::printNodeHeader(); 1135 RBBINode::printNode(n); 1136 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"true":"false"); 1137 1138 RBBIDebugPrintf(" firstpos: "); 1139 printSet(n->fFirstPosSet); 1140 1141 RBBIDebugPrintf(" lastpos: "); 1142 printSet(n->fLastPosSet); 1143 1144 RBBIDebugPrintf(" followpos: "); 1145 printSet(n->fFollowPos); 1146 1147 printPosSets(n->fLeftChild); 1148 printPosSets(n->fRightChild); 1149 } 1150 #endif 1151 1152 // 1153 // findDuplCharClassFrom() 1154 // 1155 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) { 1156 int32_t numStates = fDStates->size(); 1157 int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); 1158 1159 for (; categories->first < numCols-1; categories->first++) { 1160 // Note: dictionary & non-dictionary columns cannot be merged. 1161 // The limitSecond value prevents considering mixed pairs. 1162 // Dictionary categories are >= DictCategoriesStart. 1163 // Non dict categories are < DictCategoriesStart. 1164 int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ? 1165 fRB->fSetBuilder->getDictCategoriesStart() : numCols; 1166 for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) { 1167 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates). 1168 uint16_t table_base = 0; 1169 uint16_t table_dupl = 1; 1170 for (int32_t state=0; state<numStates; state++) { 1171 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state)); 1172 table_base = static_cast<uint16_t>(sd->fDtran->elementAti(categories->first)); 1173 table_dupl = static_cast<uint16_t>(sd->fDtran->elementAti(categories->second)); 1174 if (table_base != table_dupl) { 1175 break; 1176 } 1177 } 1178 if (table_base == table_dupl) { 1179 return true; 1180 } 1181 } 1182 } 1183 return false; 1184 } 1185 1186 1187 // 1188 // removeColumn() 1189 // 1190 void RBBITableBuilder::removeColumn(int32_t column) { 1191 int32_t numStates = fDStates->size(); 1192 for (int32_t state=0; state<numStates; state++) { 1193 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state)); 1194 U_ASSERT(column < sd->fDtran->size()); 1195 sd->fDtran->removeElementAt(column); 1196 } 1197 } 1198 1199 /* 1200 * findDuplicateState 1201 */ 1202 bool RBBITableBuilder::findDuplicateState(IntPair *states) { 1203 int32_t numStates = fDStates->size(); 1204 int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); 1205 1206 for (; states->first<numStates-1; states->first++) { 1207 RBBIStateDescriptor* firstSD = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(states->first)); 1208 for (states->second=states->first+1; states->second<numStates; states->second++) { 1209 RBBIStateDescriptor* duplSD = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(states->second)); 1210 if (firstSD->fAccepting != duplSD->fAccepting || 1211 firstSD->fLookAhead != duplSD->fLookAhead || 1212 firstSD->fTagsIdx != duplSD->fTagsIdx) { 1213 continue; 1214 } 1215 bool rowsMatch = true; 1216 for (int32_t col=0; col < numCols; ++col) { 1217 int32_t firstVal = firstSD->fDtran->elementAti(col); 1218 int32_t duplVal = duplSD->fDtran->elementAti(col); 1219 if (!((firstVal == duplVal) || 1220 ((firstVal == states->first || firstVal == states->second) && 1221 (duplVal == states->first || duplVal == states->second)))) { 1222 rowsMatch = false; 1223 break; 1224 } 1225 } 1226 if (rowsMatch) { 1227 return true; 1228 } 1229 } 1230 } 1231 return false; 1232 } 1233 1234 1235 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) { 1236 int32_t numStates = fSafeTable->size(); 1237 1238 for (; states->first<numStates-1; states->first++) { 1239 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first)); 1240 for (states->second=states->first+1; states->second<numStates; states->second++) { 1241 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second)); 1242 bool rowsMatch = true; 1243 int32_t numCols = firstRow->length(); 1244 for (int32_t col=0; col < numCols; ++col) { 1245 int32_t firstVal = firstRow->charAt(col); 1246 int32_t duplVal = duplRow->charAt(col); 1247 if (!((firstVal == duplVal) || 1248 ((firstVal == states->first || firstVal == states->second) && 1249 (duplVal == states->first || duplVal == states->second)))) { 1250 rowsMatch = false; 1251 break; 1252 } 1253 } 1254 if (rowsMatch) { 1255 return true; 1256 } 1257 } 1258 } 1259 return false; 1260 } 1261 1262 1263 void RBBITableBuilder::removeState(IntPair duplStates) { 1264 const int32_t keepState = duplStates.first; 1265 const int32_t duplState = duplStates.second; 1266 U_ASSERT(keepState < duplState); 1267 U_ASSERT(duplState < fDStates->size()); 1268 1269 RBBIStateDescriptor* duplSD = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(duplState)); 1270 fDStates->removeElementAt(duplState); 1271 delete duplSD; 1272 1273 int32_t numStates = fDStates->size(); 1274 int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); 1275 for (int32_t state=0; state<numStates; ++state) { 1276 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state)); 1277 for (int32_t col=0; col<numCols; col++) { 1278 int32_t existingVal = sd->fDtran->elementAti(col); 1279 int32_t newVal = existingVal; 1280 if (existingVal == duplState) { 1281 newVal = keepState; 1282 } else if (existingVal > duplState) { 1283 newVal = existingVal - 1; 1284 } 1285 sd->fDtran->setElementAt(newVal, col); 1286 } 1287 } 1288 } 1289 1290 void RBBITableBuilder::removeSafeState(IntPair duplStates) { 1291 const int32_t keepState = duplStates.first; 1292 const int32_t duplState = duplStates.second; 1293 U_ASSERT(keepState < duplState); 1294 U_ASSERT(duplState < fSafeTable->size()); 1295 1296 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function 1297 // and will auto-delete the removed element. 1298 int32_t numStates = fSafeTable->size(); 1299 for (int32_t state=0; state<numStates; ++state) { 1300 UnicodeString* sd = static_cast<UnicodeString*>(fSafeTable->elementAt(state)); 1301 int32_t numCols = sd->length(); 1302 for (int32_t col=0; col<numCols; col++) { 1303 int32_t existingVal = sd->charAt(col); 1304 int32_t newVal = existingVal; 1305 if (existingVal == duplState) { 1306 newVal = keepState; 1307 } else if (existingVal > duplState) { 1308 newVal = existingVal - 1; 1309 } 1310 sd->setCharAt(col, static_cast<char16_t>(newVal)); 1311 } 1312 } 1313 } 1314 1315 1316 /* 1317 * RemoveDuplicateStates 1318 */ 1319 int32_t RBBITableBuilder::removeDuplicateStates() { 1320 IntPair dupls = {3, 0}; 1321 int32_t numStatesRemoved = 0; 1322 1323 while (findDuplicateState(&dupls)) { 1324 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second); 1325 removeState(dupls); 1326 ++numStatesRemoved; 1327 } 1328 return numStatesRemoved; 1329 } 1330 1331 1332 //----------------------------------------------------------------------------- 1333 // 1334 // getTableSize() Calculate the size of the runtime form of this 1335 // state transition table. 1336 // 1337 //----------------------------------------------------------------------------- 1338 int32_t RBBITableBuilder::getTableSize() const { 1339 int32_t size = 0; 1340 int32_t numRows; 1341 int32_t numCols; 1342 int32_t rowSize; 1343 1344 if (fTree == nullptr) { 1345 return 0; 1346 } 1347 1348 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. 1349 1350 numRows = fDStates->size(); 1351 numCols = fRB->fSetBuilder->getNumCharCategories(); 1352 1353 if (use8BitsForTable()) { 1354 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols; 1355 } else { 1356 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols; 1357 } 1358 size += numRows * rowSize; 1359 return size; 1360 } 1361 1362 bool RBBITableBuilder::use8BitsForTable() const { 1363 return fDStates->size() <= kMaxStateFor8BitsTable; 1364 } 1365 1366 //----------------------------------------------------------------------------- 1367 // 1368 // exportTable() export the state transition table in the format required 1369 // by the runtime engine. getTableSize() bytes of memory 1370 // must be available at the output address "where". 1371 // 1372 //----------------------------------------------------------------------------- 1373 void RBBITableBuilder::exportTable(void *where) { 1374 RBBIStateTable* table = static_cast<RBBIStateTable*>(where); 1375 uint32_t state; 1376 int col; 1377 1378 if (U_FAILURE(*fStatus) || fTree == nullptr) { 1379 return; 1380 } 1381 1382 int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); 1383 if (catCount > 0x7fff || 1384 fDStates->size() > 0x7fff) { 1385 *fStatus = U_BRK_INTERNAL_ERROR; 1386 return; 1387 } 1388 1389 table->fNumStates = fDStates->size(); 1390 table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart(); 1391 table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1; 1392 table->fFlags = 0; 1393 if (use8BitsForTable()) { 1394 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; 1395 table->fFlags |= RBBI_8BITS_ROWS; 1396 } else { 1397 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount; 1398 } 1399 if (fRB->fLookAheadHardBreak) { 1400 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; 1401 } 1402 if (fRB->fSetBuilder->sawBOF()) { 1403 table->fFlags |= RBBI_BOF_REQUIRED; 1404 } 1405 1406 for (state=0; state<table->fNumStates; state++) { 1407 RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state)); 1408 RBBIStateTableRow* row = reinterpret_cast<RBBIStateTableRow*>(table->fTableData + state * table->fRowLen); 1409 if (use8BitsForTable()) { 1410 U_ASSERT (sd->fAccepting <= 255); 1411 U_ASSERT (sd->fLookAhead <= 255); 1412 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255); 1413 RBBIStateTableRow8* r8 = reinterpret_cast<RBBIStateTableRow8*>(row); 1414 r8->fAccepting = sd->fAccepting; 1415 r8->fLookAhead = sd->fLookAhead; 1416 r8->fTagsIdx = sd->fTagsIdx; 1417 for (col=0; col<catCount; col++) { 1418 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable); 1419 r8->fNextState[col] = sd->fDtran->elementAti(col); 1420 } 1421 } else { 1422 U_ASSERT (sd->fAccepting <= 0xffff); 1423 U_ASSERT (sd->fLookAhead <= 0xffff); 1424 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff); 1425 row->r16.fAccepting = sd->fAccepting; 1426 row->r16.fLookAhead = sd->fLookAhead; 1427 row->r16.fTagsIdx = sd->fTagsIdx; 1428 for (col=0; col<catCount; col++) { 1429 row->r16.fNextState[col] = sd->fDtran->elementAti(col); 1430 } 1431 } 1432 } 1433 } 1434 1435 1436 /** 1437 * Synthesize a safe state table from the main state table. 1438 */ 1439 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) { 1440 // The safe table creation has three steps: 1441 1442 // 1. Identify pairs of character classes that are "safe." Safe means that boundaries 1443 // following the pair do not depend on context or state before the pair. To test 1444 // whether a pair is safe, run it through the main forward state table, starting 1445 // from each state. If the final state is the same, no matter what the starting state, 1446 // the pair is safe. 1447 // 1448 // 2. Build a state table that recognizes the safe pairs. It's similar to their 1449 // forward table, with a column for each input character [class], and a row for 1450 // each state. Row 1 is the start state, and row 0 is the stop state. Initially 1451 // create an additional state for each input character category; being in 1452 // one of these states means that the character has been seen, and is potentially 1453 // the first of a pair. In each of these rows, the entry for the second character 1454 // of a safe pair is set to the stop state (0), indicating that a match was found. 1455 // All other table entries are set to the state corresponding the current input 1456 // character, allowing that character to be the of a start following pair. 1457 // 1458 // Because the safe rules are to be run in reverse, moving backwards in the text, 1459 // the first and second pair categories are swapped when building the table. 1460 // 1461 // 3. Compress the table. There are typically many rows (states) that are 1462 // equivalent - that have zeroes (match completed) in the same columns - 1463 // and can be folded together. 1464 1465 // Each safe pair is stored as two UChars in the safePair string. 1466 UnicodeString safePairs; 1467 1468 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories(); 1469 int32_t numStates = fDStates->size(); 1470 1471 for (int32_t c1=0; c1<numCharClasses; ++c1) { 1472 for (int32_t c2=0; c2 < numCharClasses; ++c2) { 1473 int32_t wantedEndState = -1; 1474 int32_t endState = 0; 1475 for (int32_t startState = 1; startState < numStates; ++startState) { 1476 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState)); 1477 int32_t s2 = startStateD->fDtran->elementAti(c1); 1478 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2)); 1479 endState = s2StateD->fDtran->elementAti(c2); 1480 if (wantedEndState < 0) { 1481 wantedEndState = endState; 1482 } else { 1483 if (wantedEndState != endState) { 1484 break; 1485 } 1486 } 1487 } 1488 if (wantedEndState == endState) { 1489 safePairs.append(static_cast<char16_t>(c1)); 1490 safePairs.append(static_cast<char16_t>(c2)); 1491 // printf("(%d, %d) ", c1, c2); 1492 } 1493 } 1494 // printf("\n"); 1495 } 1496 1497 // Populate the initial safe table. 1498 // The table as a whole is UVector<UnicodeString> 1499 // Each row is represented by a UnicodeString, being used as a Vector<int16>. 1500 // Row 0 is the stop state. 1501 // Row 1 is the start state. 1502 // Row 2 and beyond are other states, initially one per char class, but 1503 // after initial construction, many of the states will be combined, compacting the table. 1504 // The String holds the nextState data only. The four leading fields of a row, fAccepting, 1505 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building. 1506 1507 U_ASSERT(fSafeTable == nullptr); 1508 LocalPointer<UVector> lpSafeTable( 1509 new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status); 1510 if (U_FAILURE(status)) { 1511 return; 1512 } 1513 fSafeTable = lpSafeTable.orphan(); 1514 for (int32_t row=0; row<numCharClasses + 2; ++row) { 1515 LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status); 1516 fSafeTable->adoptElement(lpString.orphan(), status); 1517 } 1518 if (U_FAILURE(status)) { 1519 return; 1520 } 1521 1522 // From the start state, each input char class transitions to the state for that input. 1523 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1)); 1524 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) { 1525 // Note: +2 for the start & stop state. 1526 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2)); 1527 } 1528 1529 // Initially make every other state table row look like the start state row, 1530 for (int32_t row=2; row<numCharClasses+2; ++row) { 1531 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row)); 1532 rowState = startState; // UnicodeString assignment, copies contents. 1533 } 1534 1535 // Run through the safe pairs, set the next state to zero when pair has been seen. 1536 // Zero being the stop state, meaning we found a safe point. 1537 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) { 1538 int32_t c1 = safePairs.charAt(pairIdx); 1539 int32_t c2 = safePairs.charAt(pairIdx + 1); 1540 1541 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2)); 1542 rowState.setCharAt(c1, 0); 1543 } 1544 1545 // Remove duplicate or redundant rows from the table. 1546 IntPair states = {1, 0}; 1547 while (findDuplicateSafeState(&states)) { 1548 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second); 1549 removeSafeState(states); 1550 } 1551 } 1552 1553 1554 //----------------------------------------------------------------------------- 1555 // 1556 // getSafeTableSize() Calculate the size of the runtime form of this 1557 // safe state table. 1558 // 1559 //----------------------------------------------------------------------------- 1560 int32_t RBBITableBuilder::getSafeTableSize() const { 1561 int32_t size = 0; 1562 int32_t numRows; 1563 int32_t numCols; 1564 int32_t rowSize; 1565 1566 if (fSafeTable == nullptr) { 1567 return 0; 1568 } 1569 1570 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. 1571 1572 numRows = fSafeTable->size(); 1573 numCols = fRB->fSetBuilder->getNumCharCategories(); 1574 1575 if (use8BitsForSafeTable()) { 1576 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols; 1577 } else { 1578 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols; 1579 } 1580 size += numRows * rowSize; 1581 return size; 1582 } 1583 1584 bool RBBITableBuilder::use8BitsForSafeTable() const { 1585 return fSafeTable->size() <= kMaxStateFor8BitsTable; 1586 } 1587 1588 //----------------------------------------------------------------------------- 1589 // 1590 // exportSafeTable() export the state transition table in the format required 1591 // by the runtime engine. getTableSize() bytes of memory 1592 // must be available at the output address "where". 1593 // 1594 //----------------------------------------------------------------------------- 1595 void RBBITableBuilder::exportSafeTable(void *where) { 1596 RBBIStateTable* table = static_cast<RBBIStateTable*>(where); 1597 uint32_t state; 1598 int col; 1599 1600 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) { 1601 return; 1602 } 1603 1604 int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); 1605 if (catCount > 0x7fff || 1606 fSafeTable->size() > 0x7fff) { 1607 *fStatus = U_BRK_INTERNAL_ERROR; 1608 return; 1609 } 1610 1611 table->fNumStates = fSafeTable->size(); 1612 table->fFlags = 0; 1613 if (use8BitsForSafeTable()) { 1614 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; 1615 table->fFlags |= RBBI_8BITS_ROWS; 1616 } else { 1617 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount; 1618 } 1619 1620 for (state=0; state<table->fNumStates; state++) { 1621 UnicodeString* rowString = static_cast<UnicodeString*>(fSafeTable->elementAt(state)); 1622 RBBIStateTableRow* row = reinterpret_cast<RBBIStateTableRow*>(table->fTableData + state * table->fRowLen); 1623 if (use8BitsForSafeTable()) { 1624 RBBIStateTableRow8* r8 = reinterpret_cast<RBBIStateTableRow8*>(row); 1625 r8->fAccepting = 0; 1626 r8->fLookAhead = 0; 1627 r8->fTagsIdx = 0; 1628 for (col=0; col<catCount; col++) { 1629 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable); 1630 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col)); 1631 } 1632 } else { 1633 row->r16.fAccepting = 0; 1634 row->r16.fLookAhead = 0; 1635 row->r16.fTagsIdx = 0; 1636 for (col=0; col<catCount; col++) { 1637 row->r16.fNextState[col] = rowString->charAt(col); 1638 } 1639 } 1640 } 1641 } 1642 1643 1644 1645 1646 //----------------------------------------------------------------------------- 1647 // 1648 // printSet Debug function. Print the contents of a UVector 1649 // 1650 //----------------------------------------------------------------------------- 1651 #ifdef RBBI_DEBUG 1652 void RBBITableBuilder::printSet(UVector *s) { 1653 int32_t i; 1654 for (i=0; i<s->size(); i++) { 1655 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i)); 1656 RBBIDebugPrintf("%5d", v==nullptr? -1 : v->fSerialNum); 1657 } 1658 RBBIDebugPrintf("\n"); 1659 } 1660 #endif 1661 1662 1663 //----------------------------------------------------------------------------- 1664 // 1665 // printStates Debug Function. Dump the fully constructed state transition table. 1666 // 1667 //----------------------------------------------------------------------------- 1668 #ifdef RBBI_DEBUG 1669 void RBBITableBuilder::printStates() { 1670 int c; // input "character" 1671 int n; // state number 1672 1673 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1674 RBBIDebugPrintf(" | Acc LA Tag"); 1675 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1676 RBBIDebugPrintf(" %3d", c); 1677 } 1678 RBBIDebugPrintf("\n"); 1679 RBBIDebugPrintf(" |---------------"); 1680 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1681 RBBIDebugPrintf("----"); 1682 } 1683 RBBIDebugPrintf("\n"); 1684 1685 for (n=0; n<fDStates->size(); n++) { 1686 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 1687 RBBIDebugPrintf(" %3d | " , n); 1688 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); 1689 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1690 RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c)); 1691 } 1692 RBBIDebugPrintf("\n"); 1693 } 1694 RBBIDebugPrintf("\n\n"); 1695 } 1696 #endif 1697 1698 1699 //----------------------------------------------------------------------------- 1700 // 1701 // printSafeTable Debug Function. Dump the fully constructed safe table. 1702 // 1703 //----------------------------------------------------------------------------- 1704 #ifdef RBBI_DEBUG 1705 void RBBITableBuilder::printReverseTable() { 1706 int c; // input "character" 1707 int n; // state number 1708 1709 RBBIDebugPrintf(" Safe Reverse Table \n"); 1710 if (fSafeTable == nullptr) { 1711 RBBIDebugPrintf(" --- nullptr ---\n"); 1712 return; 1713 } 1714 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1715 RBBIDebugPrintf(" | Acc LA Tag"); 1716 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1717 RBBIDebugPrintf(" %2d", c); 1718 } 1719 RBBIDebugPrintf("\n"); 1720 RBBIDebugPrintf(" |---------------"); 1721 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1722 RBBIDebugPrintf("---"); 1723 } 1724 RBBIDebugPrintf("\n"); 1725 1726 for (n=0; n<fSafeTable->size(); n++) { 1727 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n); 1728 RBBIDebugPrintf(" %3d | " , n); 1729 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags 1730 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1731 RBBIDebugPrintf(" %2d", rowString->charAt(c)); 1732 } 1733 RBBIDebugPrintf("\n"); 1734 } 1735 RBBIDebugPrintf("\n\n"); 1736 } 1737 #endif 1738 1739 1740 1741 //----------------------------------------------------------------------------- 1742 // 1743 // printRuleStatusTable Debug Function. Dump the common rule status table 1744 // 1745 //----------------------------------------------------------------------------- 1746 #ifdef RBBI_DEBUG 1747 void RBBITableBuilder::printRuleStatusTable() { 1748 int32_t thisRecord = 0; 1749 int32_t nextRecord = 0; 1750 int i; 1751 UVector *tbl = fRB->fRuleStatusVals; 1752 1753 RBBIDebugPrintf("index | tags \n"); 1754 RBBIDebugPrintf("-------------------\n"); 1755 1756 while (nextRecord < tbl->size()) { 1757 thisRecord = nextRecord; 1758 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; 1759 RBBIDebugPrintf("%4d ", thisRecord); 1760 for (i=thisRecord+1; i<nextRecord; i++) { 1761 RBBIDebugPrintf(" %5d", tbl->elementAti(i)); 1762 } 1763 RBBIDebugPrintf("\n"); 1764 } 1765 RBBIDebugPrintf("\n\n"); 1766 } 1767 #endif 1768 1769 1770 //----------------------------------------------------------------------------- 1771 // 1772 // RBBIStateDescriptor Methods. This is a very struct-like class 1773 // Most access is directly to the fields. 1774 // 1775 //----------------------------------------------------------------------------- 1776 1777 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { 1778 fMarked = false; 1779 fAccepting = 0; 1780 fLookAhead = 0; 1781 fTagsIdx = 0; 1782 fTagVals = nullptr; 1783 fPositions = nullptr; 1784 fDtran = nullptr; 1785 1786 fDtran = new UVector32(lastInputSymbol+1, *fStatus); 1787 if (U_FAILURE(*fStatus)) { 1788 return; 1789 } 1790 if (fDtran == nullptr) { 1791 *fStatus = U_MEMORY_ALLOCATION_ERROR; 1792 return; 1793 } 1794 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized. 1795 // It is indexed by input symbols, and will 1796 // hold the next state number for each 1797 // symbol. 1798 } 1799 1800 1801 RBBIStateDescriptor::~RBBIStateDescriptor() { 1802 delete fPositions; 1803 delete fDtran; 1804 delete fTagVals; 1805 fPositions = nullptr; 1806 fDtran = nullptr; 1807 fTagVals = nullptr; 1808 } 1809 1810 U_NAMESPACE_END 1811 1812 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */