rbbi.cpp (44554B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 1999-2016 International Business Machines Corporation 6 * and others. All rights reserved. 7 *************************************************************************** 8 */ 9 // 10 // file: rbbi.cpp Contains the implementation of the rule based break iterator 11 // runtime engine and the API implementation for 12 // class RuleBasedBreakIterator 13 // 14 15 #include "utypeinfo.h" // for 'typeid' to work 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include <cinttypes> 22 23 #include "unicode/rbbi.h" 24 #include "unicode/schriter.h" 25 #include "unicode/uchriter.h" 26 #include "unicode/uclean.h" 27 #include "unicode/udata.h" 28 29 #include "brkeng.h" 30 #include "ucln_cmn.h" 31 #include "cmemory.h" 32 #include "cstring.h" 33 #include "localsvc.h" 34 #include "rbbidata.h" 35 #include "rbbi_cache.h" 36 #include "rbbirb.h" 37 #include "uassert.h" 38 #include "umutex.h" 39 #include "uvectr32.h" 40 41 #ifdef RBBI_DEBUG 42 static UBool gTrace = false; 43 #endif 44 45 U_NAMESPACE_BEGIN 46 47 // The state number of the starting state 48 constexpr int32_t START_STATE = 1; 49 50 // The state-transition value indicating "stop" 51 constexpr int32_t STOP_STATE = 0; 52 53 54 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 55 56 57 //======================================================================= 58 // constructors 59 //======================================================================= 60 61 /** 62 * Constructs a RuleBasedBreakIterator that uses the already-created 63 * tables object that is passed in as a parameter. 64 */ 65 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 66 : RuleBasedBreakIterator(&status) 67 { 68 fData = new RBBIDataWrapper(data, status); // status checked in constructor 69 if (U_FAILURE(status)) {return;} 70 if(fData == nullptr) { 71 status = U_MEMORY_ALLOCATION_ERROR; 72 return; 73 } 74 if (fData->fForwardTable->fLookAheadResultsSize > 0) { 75 fLookAheadMatches = static_cast<int32_t *>( 76 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); 77 if (fLookAheadMatches == nullptr) { 78 status = U_MEMORY_ALLOCATION_ERROR; 79 return; 80 } 81 } 82 } 83 84 //------------------------------------------------------------------------------- 85 // 86 // Constructor from a UDataMemory handle to precompiled break rules 87 // stored in an ICU data file. This construcotr is private API, 88 // only for internal use. 89 // 90 //------------------------------------------------------------------------------- 91 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, 92 UErrorCode &status) : RuleBasedBreakIterator(udm, status) 93 { 94 fIsPhraseBreaking = isPhraseBreaking; 95 } 96 97 // 98 // Construct from precompiled binary rules (tables). This constructor is public API, 99 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 100 // 101 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 102 uint32_t ruleLength, 103 UErrorCode &status) 104 : RuleBasedBreakIterator(&status) 105 { 106 if (U_FAILURE(status)) { 107 return; 108 } 109 if (compiledRules == nullptr || ruleLength < sizeof(RBBIDataHeader)) { 110 status = U_ILLEGAL_ARGUMENT_ERROR; 111 return; 112 } 113 const RBBIDataHeader* data = reinterpret_cast<const RBBIDataHeader*>(compiledRules); 114 if (data->fLength > ruleLength) { 115 status = U_ILLEGAL_ARGUMENT_ERROR; 116 return; 117 } 118 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 119 if (U_FAILURE(status)) {return;} 120 if(fData == nullptr) { 121 status = U_MEMORY_ALLOCATION_ERROR; 122 return; 123 } 124 if (fData->fForwardTable->fLookAheadResultsSize > 0) { 125 fLookAheadMatches = static_cast<int32_t *>( 126 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); 127 if (fLookAheadMatches == nullptr) { 128 status = U_MEMORY_ALLOCATION_ERROR; 129 return; 130 } 131 } 132 } 133 134 135 //------------------------------------------------------------------------------- 136 // 137 // Constructor from a UDataMemory handle to precompiled break rules 138 // stored in an ICU data file. 139 // 140 //------------------------------------------------------------------------------- 141 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 142 : RuleBasedBreakIterator(&status) 143 { 144 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 145 if (U_FAILURE(status)) {return;} 146 if(fData == nullptr) { 147 status = U_MEMORY_ALLOCATION_ERROR; 148 return; 149 } 150 if (fData->fForwardTable->fLookAheadResultsSize > 0) { 151 fLookAheadMatches = static_cast<int32_t *>( 152 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); 153 if (fLookAheadMatches == nullptr) { 154 status = U_MEMORY_ALLOCATION_ERROR; 155 return; 156 } 157 } 158 } 159 160 161 162 //------------------------------------------------------------------------------- 163 // 164 // Constructor from a set of rules supplied as a string. 165 // 166 //------------------------------------------------------------------------------- 167 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 168 UParseError &parseError, 169 UErrorCode &status) 170 : RuleBasedBreakIterator(&status) 171 { 172 if (U_FAILURE(status)) {return;} 173 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 174 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 175 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 176 // creates and returns a complete RBBI. From here, in a constructor, we 177 // can't just return the object created by the builder factory, hence 178 // the assignment of the factory created object to "this". 179 if (U_SUCCESS(status)) { 180 *this = *bi; 181 delete bi; 182 } 183 } 184 185 186 //------------------------------------------------------------------------------- 187 // 188 // Default Constructor. Create an empty shell that can be set up later. 189 // Used when creating a RuleBasedBreakIterator from a set 190 // of rules. 191 //------------------------------------------------------------------------------- 192 RuleBasedBreakIterator::RuleBasedBreakIterator() 193 : RuleBasedBreakIterator(nullptr) 194 { 195 } 196 197 /** 198 * Simple Constructor with an error code. 199 * Handles common initialization for all other constructors. 200 */ 201 RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) { 202 UErrorCode ec = U_ZERO_ERROR; 203 if (status == nullptr) { 204 status = &ec; 205 } 206 utext_openUChars(&fText, nullptr, 0, status); 207 LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, *status), *status); 208 LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, *status), *status); 209 if (U_FAILURE(*status)) { 210 fErrorCode = *status; 211 return; 212 } 213 fDictionaryCache = lpDictionaryCache.orphan(); 214 fBreakCache = lpBreakCache.orphan(); 215 216 #ifdef RBBI_DEBUG 217 static UBool debugInitDone = false; 218 if (debugInitDone == false) { 219 char *debugEnv = getenv("U_RBBIDEBUG"); 220 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 221 gTrace = true; 222 } 223 debugInitDone = true; 224 } 225 #endif 226 } 227 228 229 //------------------------------------------------------------------------------- 230 // 231 // Copy constructor. Will produce a break iterator with the same behavior, 232 // and which iterates over the same text, as the one passed in. 233 // 234 //------------------------------------------------------------------------------- 235 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 236 : RuleBasedBreakIterator() 237 { 238 *this = other; 239 } 240 241 242 /** 243 * Destructor 244 */ 245 RuleBasedBreakIterator::~RuleBasedBreakIterator() { 246 if (fCharIter != &fSCharIter) { 247 // fCharIter was adopted from the outside. 248 delete fCharIter; 249 } 250 fCharIter = nullptr; 251 252 utext_close(&fText); 253 254 if (fData != nullptr) { 255 fData->removeReference(); 256 fData = nullptr; 257 } 258 delete fBreakCache; 259 fBreakCache = nullptr; 260 261 delete fDictionaryCache; 262 fDictionaryCache = nullptr; 263 264 delete fLanguageBreakEngines; 265 fLanguageBreakEngines = nullptr; 266 267 delete fUnhandledBreakEngine; 268 fUnhandledBreakEngine = nullptr; 269 270 uprv_free(fLookAheadMatches); 271 fLookAheadMatches = nullptr; 272 } 273 274 /** 275 * Assignment operator. Sets this iterator to have the same behavior, 276 * and iterate over the same text, as the one passed in. 277 * TODO: needs better handling of memory allocation errors. 278 */ 279 RuleBasedBreakIterator& 280 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 281 if (this == &that) { 282 return *this; 283 } 284 BreakIterator::operator=(that); 285 286 if (fLanguageBreakEngines != nullptr) { 287 delete fLanguageBreakEngines; 288 fLanguageBreakEngines = nullptr; // Just rebuild for now 289 } 290 // TODO: clone fLanguageBreakEngines from "that" 291 UErrorCode status = U_ZERO_ERROR; 292 utext_clone(&fText, &that.fText, false, true, &status); 293 294 if (fCharIter != &fSCharIter) { 295 delete fCharIter; 296 } 297 fCharIter = &fSCharIter; 298 299 if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) { 300 // This is a little bit tricky - it will initially appear that 301 // this->fCharIter is adopted, even if that->fCharIter was 302 // not adopted. That's ok. 303 fCharIter = that.fCharIter->clone(); 304 } 305 fSCharIter = that.fSCharIter; 306 if (fCharIter == nullptr) { 307 fCharIter = &fSCharIter; 308 } 309 310 if (fData != nullptr) { 311 fData->removeReference(); 312 fData = nullptr; 313 } 314 if (that.fData != nullptr) { 315 fData = that.fData->addReference(); 316 } 317 318 uprv_free(fLookAheadMatches); 319 fLookAheadMatches = nullptr; 320 if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) { 321 fLookAheadMatches = static_cast<int32_t *>( 322 uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); 323 } 324 325 326 fPosition = that.fPosition; 327 fRuleStatusIndex = that.fRuleStatusIndex; 328 fDone = that.fDone; 329 330 // TODO: both the dictionary and the main cache need to be copied. 331 // Current position could be within a dictionary range. Trying to continue 332 // the iteration without the caches present would go to the rules, with 333 // the assumption that the current position is on a rule boundary. 334 fBreakCache->reset(fPosition, fRuleStatusIndex); 335 fDictionaryCache->reset(); 336 337 return *this; 338 } 339 340 //----------------------------------------------------------------------------- 341 // 342 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same 343 // behavior, and iterating over the same text, as this one. 344 // Virtual function: does the right thing with subclasses. 345 // 346 //----------------------------------------------------------------------------- 347 RuleBasedBreakIterator* 348 RuleBasedBreakIterator::clone() const { 349 return new RuleBasedBreakIterator(*this); 350 } 351 352 /** 353 * Equality operator. Returns true if both BreakIterators are of the 354 * same class, have the same behavior, and iterate over the same text. 355 */ 356 bool 357 RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 358 if (typeid(*this) != typeid(that)) { 359 return false; 360 } 361 if (this == &that) { 362 return true; 363 } 364 365 // The base class BreakIterator carries no state that participates in equality, 366 // and does not implement an equality function that would otherwise be 367 // checked at this point. 368 369 const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that); 370 371 if (!utext_equals(&fText, &that2.fText)) { 372 // The two break iterators are operating on different text, 373 // or have a different iteration position. 374 // Note that fText's position is always the same as the break iterator's position. 375 return false; 376 } 377 378 if (!(fPosition == that2.fPosition && 379 fRuleStatusIndex == that2.fRuleStatusIndex && 380 fDone == that2.fDone)) { 381 return false; 382 } 383 384 if (that2.fData == fData || 385 (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) { 386 // The two break iterators are using the same rules. 387 return true; 388 } 389 return false; 390 } 391 392 /** 393 * Compute a hash code for this BreakIterator 394 * @return A hash code 395 */ 396 int32_t 397 RuleBasedBreakIterator::hashCode() const { 398 int32_t hash = 0; 399 if (fData != nullptr) { 400 hash = fData->hashCode(); 401 } 402 return hash; 403 } 404 405 406 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 407 if (U_FAILURE(status)) { 408 return; 409 } 410 fBreakCache->reset(); 411 fDictionaryCache->reset(); 412 utext_clone(&fText, ut, false, true, &status); 413 414 // Set up a dummy CharacterIterator to be returned if anyone 415 // calls getText(). With input from UText, there is no reasonable 416 // way to return a characterIterator over the actual input text. 417 // Return one over an empty string instead - this is the closest 418 // we can come to signaling a failure. 419 // (GetText() is obsolete, this failure is sort of OK) 420 fSCharIter.setText(u"", 0); 421 422 if (fCharIter != &fSCharIter) { 423 // existing fCharIter was adopted from the outside. Delete it now. 424 delete fCharIter; 425 } 426 fCharIter = &fSCharIter; 427 428 this->first(); 429 } 430 431 432 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 433 UText *result = utext_clone(fillIn, &fText, false, true, &status); 434 return result; 435 } 436 437 438 //======================================================================= 439 // BreakIterator overrides 440 //======================================================================= 441 442 /** 443 * Return a CharacterIterator over the text being analyzed. 444 */ 445 CharacterIterator& 446 RuleBasedBreakIterator::getText() const { 447 return *fCharIter; 448 } 449 450 /** 451 * Set the iterator to analyze a new piece of text. This function resets 452 * the current iteration position to the beginning of the text. 453 * @param newText An iterator over the text to analyze. 454 */ 455 void 456 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 457 // If we are holding a CharacterIterator adopted from a 458 // previous call to this function, delete it now. 459 if (fCharIter != &fSCharIter) { 460 delete fCharIter; 461 } 462 463 fCharIter = newText; 464 UErrorCode status = U_ZERO_ERROR; 465 fBreakCache->reset(); 466 fDictionaryCache->reset(); 467 if (newText==nullptr || newText->startIndex() != 0) { 468 // startIndex !=0 wants to be an error, but there's no way to report it. 469 // Make the iterator text be an empty string. 470 utext_openUChars(&fText, nullptr, 0, &status); 471 } else { 472 utext_openCharacterIterator(&fText, newText, &status); 473 } 474 this->first(); 475 } 476 477 /** 478 * Set the iterator to analyze a new piece of text. This function resets 479 * the current iteration position to the beginning of the text. 480 * @param newText An iterator over the text to analyze. 481 */ 482 void 483 RuleBasedBreakIterator::setText(const UnicodeString& newText) { 484 UErrorCode status = U_ZERO_ERROR; 485 fBreakCache->reset(); 486 fDictionaryCache->reset(); 487 utext_openConstUnicodeString(&fText, &newText, &status); 488 489 // Set up a character iterator on the string. 490 // Needed in case someone calls getText(). 491 // Can not, unfortunately, do this lazily on the (probably never) 492 // call to getText(), because getText is const. 493 fSCharIter.setText(newText.getBuffer(), newText.length()); 494 495 if (fCharIter != &fSCharIter) { 496 // old fCharIter was adopted from the outside. Delete it. 497 delete fCharIter; 498 } 499 fCharIter = &fSCharIter; 500 501 this->first(); 502 } 503 504 505 /** 506 * Provide a new UText for the input text. Must reference text with contents identical 507 * to the original. 508 * Intended for use with text data originating in Java (garbage collected) environments 509 * where the data may be moved in memory at arbitrary times. 510 */ 511 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 512 if (U_FAILURE(status)) { 513 return *this; 514 } 515 if (input == nullptr) { 516 status = U_ILLEGAL_ARGUMENT_ERROR; 517 return *this; 518 } 519 int64_t pos = utext_getNativeIndex(&fText); 520 // Shallow read-only clone of the new UText into the existing input UText 521 utext_clone(&fText, input, false, true, &status); 522 if (U_FAILURE(status)) { 523 return *this; 524 } 525 utext_setNativeIndex(&fText, pos); 526 if (utext_getNativeIndex(&fText) != pos) { 527 // Sanity check. The new input utext is supposed to have the exact same 528 // contents as the old. If we can't set to the same position, it doesn't. 529 // The contents underlying the old utext might be invalid at this point, 530 // so it's not safe to check directly. 531 status = U_ILLEGAL_ARGUMENT_ERROR; 532 } 533 return *this; 534 } 535 536 537 /** 538 * Sets the current iteration position to the beginning of the text, position zero. 539 * @return The new iterator position, which is zero. 540 */ 541 int32_t RuleBasedBreakIterator::first() { 542 UErrorCode status = U_ZERO_ERROR; 543 if (!fBreakCache->seek(0)) { 544 fBreakCache->populateNear(0, status); 545 } 546 fBreakCache->current(); 547 U_ASSERT(fPosition == 0); 548 return 0; 549 } 550 551 /** 552 * Sets the current iteration position to the end of the text. 553 * @return The text's past-the-end offset. 554 */ 555 int32_t RuleBasedBreakIterator::last() { 556 int32_t endPos = static_cast<int32_t>(utext_nativeLength(&fText)); 557 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 558 (void)endShouldBeBoundary; 559 U_ASSERT(endShouldBeBoundary); 560 U_ASSERT(fPosition == endPos); 561 return endPos; 562 } 563 564 /** 565 * Advances the iterator either forward or backward the specified number of steps. 566 * Negative values move backward, and positive values move forward. This is 567 * equivalent to repeatedly calling next() or previous(). 568 * @param n The number of steps to move. The sign indicates the direction 569 * (negative is backwards, and positive is forwards). 570 * @return The character offset of the boundary position n boundaries away from 571 * the current one. 572 */ 573 int32_t RuleBasedBreakIterator::next(int32_t n) { 574 int32_t result = 0; 575 if (n > 0) { 576 for (; n > 0 && result != UBRK_DONE; --n) { 577 result = next(); 578 } 579 } else if (n < 0) { 580 for (; n < 0 && result != UBRK_DONE; ++n) { 581 result = previous(); 582 } 583 } else { 584 result = current(); 585 } 586 return result; 587 } 588 589 /** 590 * Advances the iterator to the next boundary position. 591 * @return The position of the first boundary after this one. 592 */ 593 int32_t RuleBasedBreakIterator::next() { 594 fBreakCache->next(); 595 return fDone ? UBRK_DONE : fPosition; 596 } 597 598 /** 599 * Move the iterator backwards, to the boundary preceding the current one. 600 * 601 * Starts from the current position within fText. 602 * Starting position need not be on a boundary. 603 * 604 * @return The position of the boundary position immediately preceding the starting position. 605 */ 606 int32_t RuleBasedBreakIterator::previous() { 607 UErrorCode status = U_ZERO_ERROR; 608 fBreakCache->previous(status); 609 return fDone ? UBRK_DONE : fPosition; 610 } 611 612 /** 613 * Sets the iterator to refer to the first boundary position following 614 * the specified position. 615 * @param startPos The position from which to begin searching for a break position. 616 * @return The position of the first break after the current position. 617 */ 618 int32_t RuleBasedBreakIterator::following(int32_t startPos) { 619 // if the supplied position is before the beginning, return the 620 // text's starting offset 621 if (startPos < 0) { 622 return first(); 623 } 624 625 // Move requested offset to a code point start. It might be on a trail surrogate, 626 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text. 627 utext_setNativeIndex(&fText, startPos); 628 startPos = static_cast<int32_t>(utext_getNativeIndex(&fText)); 629 630 UErrorCode status = U_ZERO_ERROR; 631 fBreakCache->following(startPos, status); 632 return fDone ? UBRK_DONE : fPosition; 633 } 634 635 /** 636 * Sets the iterator to refer to the last boundary position before the 637 * specified position. 638 * @param offset The position to begin searching for a break from. 639 * @return The position of the last boundary before the starting position. 640 */ 641 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 642 if (offset > utext_nativeLength(&fText)) { 643 return last(); 644 } 645 646 // Move requested offset to a code point start. It might be on a trail surrogate, 647 // or on a trail byte if the input is UTF-8. 648 649 utext_setNativeIndex(&fText, offset); 650 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText)); 651 652 UErrorCode status = U_ZERO_ERROR; 653 fBreakCache->preceding(adjustedOffset, status); 654 return fDone ? UBRK_DONE : fPosition; 655 } 656 657 /** 658 * Returns true if the specified position is a boundary position. As a side 659 * effect, leaves the iterator pointing to the first boundary position at 660 * or after "offset". 661 * 662 * @param offset the offset to check. 663 * @return True if "offset" is a boundary position. 664 */ 665 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 666 // out-of-range indexes are never boundary positions 667 if (offset < 0) { 668 first(); // For side effects on current position, tag values. 669 return false; 670 } 671 672 // Adjust offset to be on a code point boundary and not beyond the end of the text. 673 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 674 // But we still need the side effect of leaving iteration at the following boundary. 675 676 utext_setNativeIndex(&fText, offset); 677 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText)); 678 679 bool result = false; 680 UErrorCode status = U_ZERO_ERROR; 681 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) { 682 result = (fBreakCache->current() == offset); 683 } 684 685 if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) { 686 // Original offset is beyond the end of the text. Return false, it's not a boundary, 687 // but the iteration position remains set to the end of the text, which is a boundary. 688 return false; 689 } 690 if (!result) { 691 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 692 // Cache->seek(), above, left us on the preceding boundary, so advance one. 693 next(); 694 } 695 return result; 696 } 697 698 699 /** 700 * Returns the current iteration position. 701 * @return The current iteration position. 702 */ 703 int32_t RuleBasedBreakIterator::current() const { 704 return fPosition; 705 } 706 707 708 //======================================================================= 709 // implementation 710 //======================================================================= 711 712 // 713 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 714 // of user text. A variable with this enum type keeps track of where we 715 // are. The state machine only fetches user input while in the RUN mode. 716 // 717 enum RBBIRunMode { 718 RBBI_START, // state machine processing is before first char of input 719 RBBI_RUN, // state machine processing is in the user text 720 RBBI_END // state machine processing is after end of user text. 721 }; 722 723 724 // Wrapper functions to select the appropriate handleNext() or handleSafePrevious() 725 // instantiation, based on whether an 8 or 16 bit table is required. 726 // 727 // These Trie access functions will be inlined within the handleNext()/Previous() instantions. 728 static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) { 729 return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c); 730 } 731 732 static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) { 733 return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c); 734 } 735 736 int32_t RuleBasedBreakIterator::handleNext() { 737 const RBBIStateTable *statetable = fData->fForwardTable; 738 bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; 739 if (statetable->fFlags & RBBI_8BITS_ROWS) { 740 if (use8BitsTrie) { 741 return handleNext<RBBIStateTableRow8, TrieFunc8>(); 742 } else { 743 return handleNext<RBBIStateTableRow8, TrieFunc16>(); 744 } 745 } else { 746 if (use8BitsTrie) { 747 return handleNext<RBBIStateTableRow16, TrieFunc8>(); 748 } else { 749 return handleNext<RBBIStateTableRow16, TrieFunc16>(); 750 } 751 } 752 } 753 754 int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { 755 const RBBIStateTable *statetable = fData->fReverseTable; 756 bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; 757 if (statetable->fFlags & RBBI_8BITS_ROWS) { 758 if (use8BitsTrie) { 759 return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition); 760 } else { 761 return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition); 762 } 763 } else { 764 if (use8BitsTrie) { 765 return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition); 766 } else { 767 return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition); 768 } 769 } 770 } 771 772 773 //----------------------------------------------------------------------------------- 774 // 775 // handleNext() 776 // Run the state machine to find a boundary 777 // 778 //----------------------------------------------------------------------------------- 779 template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc> 780 int32_t RuleBasedBreakIterator::handleNext() { 781 int32_t state; 782 uint16_t category = 0; 783 RBBIRunMode mode; 784 785 RowType *row; 786 UChar32 c; 787 int32_t result = 0; 788 int32_t initialPosition = 0; 789 const RBBIStateTable *statetable = fData->fForwardTable; 790 const char *tableData = statetable->fTableData; 791 uint32_t tableRowLen = statetable->fRowLen; 792 uint32_t dictStart = statetable->fDictCategoriesStart; 793 #ifdef RBBI_DEBUG 794 if (gTrace) { 795 RBBIDebugPuts("Handle Next pos char state category"); 796 } 797 #endif 798 799 // handleNext always sets the break tag value. 800 // Set the default for it. 801 fRuleStatusIndex = 0; 802 803 fDictionaryCharCount = 0; 804 805 // if we're already at the end of the text, return DONE. 806 initialPosition = fPosition; 807 UTEXT_SETNATIVEINDEX(&fText, initialPosition); 808 result = initialPosition; 809 c = UTEXT_NEXT32(&fText); 810 if (c==U_SENTINEL) { 811 fDone = true; 812 return UBRK_DONE; 813 } 814 815 // Set the initial state for the state machine 816 state = START_STATE; 817 row = (RowType *) 818 //(statetable->fTableData + (statetable->fRowLen * state)); 819 (tableData + tableRowLen * state); 820 821 822 mode = RBBI_RUN; 823 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 824 category = 2; 825 mode = RBBI_START; 826 } 827 828 829 // loop until we reach the end of the text or transition to state 0 830 // 831 for (;;) { 832 if (c == U_SENTINEL) { 833 // Reached end of input string. 834 if (mode == RBBI_END) { 835 // We have already run the loop one last time with the 836 // character set to the psueudo {eof} value. Now it is time 837 // to unconditionally bail out. 838 break; 839 } 840 // Run the loop one last time with the fake end-of-input character category. 841 mode = RBBI_END; 842 category = 1; 843 } 844 845 // 846 // Get the char category. An incoming category of 1 or 2 means that 847 // we are preset for doing the beginning or end of input, and 848 // that we shouldn't get a category from an actual text input character. 849 // 850 if (mode == RBBI_RUN) { 851 // look up the current character's character category, which tells us 852 // which column in the state table to look at. 853 category = trieFunc(fData->fTrie, c); 854 fDictionaryCharCount += (category >= dictStart); 855 } 856 857 #ifdef RBBI_DEBUG 858 if (gTrace) { 859 RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText)); 860 if (0x20<=c && c<0x7f) { 861 RBBIDebugPrintf("\"%c\" ", c); 862 } else { 863 RBBIDebugPrintf("%5x ", c); 864 } 865 RBBIDebugPrintf("%3d %3d\n", state, category); 866 } 867 #endif 868 869 // State Transition - move machine to its next state 870 // 871 872 // fNextState is a variable-length array. 873 U_ASSERT(category<fData->fHeader->fCatCount); 874 state = row->fNextState[category]; /*Not accessing beyond memory*/ 875 row = (RowType *) 876 // (statetable->fTableData + (statetable->fRowLen * state)); 877 (tableData + tableRowLen * state); 878 879 880 uint16_t accepting = row->fAccepting; 881 if (accepting == ACCEPTING_UNCONDITIONAL) { 882 // Match found, common case. 883 if (mode != RBBI_START) { 884 result = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText)); 885 } 886 fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values. 887 } else if (accepting > ACCEPTING_UNCONDITIONAL) { 888 // Lookahead match is completed. 889 U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize); 890 int32_t lookaheadResult = fLookAheadMatches[accepting]; 891 if (lookaheadResult >= 0) { 892 fRuleStatusIndex = row->fTagsIdx; 893 fPosition = lookaheadResult; 894 return lookaheadResult; 895 } 896 } 897 898 // If we are at the position of the '/' in a look-ahead (hard break) rule; 899 // record the current position, to be returned later, if the full rule matches. 900 // TODO: Move this check before the previous check of fAccepting. 901 // This would enable hard-break rules with no following context. 902 // But there are line break test failures when trying this. Investigate. 903 // Issue ICU-20837 904 uint16_t rule = row->fLookAhead; 905 U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL); 906 U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize); 907 if (rule > ACCEPTING_UNCONDITIONAL) { 908 int32_t pos = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText)); 909 fLookAheadMatches[rule] = pos; 910 } 911 912 if (state == STOP_STATE) { 913 // This is the normal exit from the lookup state machine. 914 // We have advanced through the string until it is certain that no 915 // longer match is possible, no matter what characters follow. 916 break; 917 } 918 919 // Advance to the next character. 920 // If this is a beginning-of-input loop iteration, don't advance 921 // the input position. The next iteration will be processing the 922 // first real input character. 923 if (mode == RBBI_RUN) { 924 c = UTEXT_NEXT32(&fText); 925 } else { 926 if (mode == RBBI_START) { 927 mode = RBBI_RUN; 928 } 929 } 930 } 931 932 // The state machine is done. Check whether it found a match... 933 934 // If the iterator failed to advance in the match engine, force it ahead by one. 935 // (This really indicates a defect in the break rules. They should always match 936 // at least one character.) 937 if (result == initialPosition) { 938 utext_setNativeIndex(&fText, initialPosition); 939 utext_next32(&fText); 940 result = static_cast<int32_t>(utext_getNativeIndex(&fText)); 941 fRuleStatusIndex = 0; 942 } 943 944 // Leave the iterator at our result position. 945 fPosition = result; 946 #ifdef RBBI_DEBUG 947 if (gTrace) { 948 RBBIDebugPrintf("result = %d\n\n", result); 949 } 950 #endif 951 return result; 952 } 953 954 955 //----------------------------------------------------------------------------------- 956 // 957 // handleSafePrevious() 958 // 959 // Iterate backwards using the safe reverse rules. 960 // The logic of this function is similar to handleNext(), but simpler 961 // because the safe table does not require as many options. 962 // 963 //----------------------------------------------------------------------------------- 964 template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc> 965 int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { 966 967 int32_t state; 968 uint16_t category = 0; 969 RowType *row; 970 UChar32 c; 971 int32_t result = 0; 972 973 const RBBIStateTable *stateTable = fData->fReverseTable; 974 UTEXT_SETNATIVEINDEX(&fText, fromPosition); 975 #ifdef RBBI_DEBUG 976 if (gTrace) { 977 RBBIDebugPuts("Handle Previous pos char state category"); 978 } 979 #endif 980 981 // if we're already at the start of the text, return DONE. 982 if (fData == nullptr || UTEXT_GETNATIVEINDEX(&fText)==0) { 983 return BreakIterator::DONE; 984 } 985 986 // Set the initial state for the state machine 987 c = UTEXT_PREVIOUS32(&fText); 988 state = START_STATE; 989 row = (RowType *) 990 (stateTable->fTableData + (stateTable->fRowLen * state)); 991 992 // loop until we reach the start of the text or transition to state 0 993 // 994 for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) { 995 996 // look up the current character's character category, which tells us 997 // which column in the state table to look at. 998 // 999 // Off the dictionary flag bit. For reverse iteration it is not used. 1000 category = trieFunc(fData->fTrie, c); 1001 1002 #ifdef RBBI_DEBUG 1003 if (gTrace) { 1004 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText)); 1005 if (0x20<=c && c<0x7f) { 1006 RBBIDebugPrintf("\"%c\" ", c); 1007 } else { 1008 RBBIDebugPrintf("%5x ", c); 1009 } 1010 RBBIDebugPrintf("%3d %3d\n", state, category); 1011 } 1012 #endif 1013 1014 // State Transition - move machine to its next state 1015 // 1016 // fNextState is a variable-length array. 1017 U_ASSERT(category<fData->fHeader->fCatCount); 1018 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1019 row = (RowType *) 1020 (stateTable->fTableData + (stateTable->fRowLen * state)); 1021 1022 if (state == STOP_STATE) { 1023 // This is the normal exit from the lookup state machine. 1024 // Transition to state zero means we have found a safe point. 1025 break; 1026 } 1027 } 1028 1029 // The state machine is done. Check whether it found a match... 1030 result = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText)); 1031 #ifdef RBBI_DEBUG 1032 if (gTrace) { 1033 RBBIDebugPrintf("result = %d\n\n", result); 1034 } 1035 #endif 1036 return result; 1037 } 1038 1039 1040 //------------------------------------------------------------------------------- 1041 // 1042 // getRuleStatus() Return the break rule tag associated with the current 1043 // iterator position. If the iterator arrived at its current 1044 // position by iterating forwards, the value will have been 1045 // cached by the handleNext() function. 1046 // 1047 //------------------------------------------------------------------------------- 1048 1049 int32_t RuleBasedBreakIterator::getRuleStatus() const { 1050 1051 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1052 // (the number of status values.) 1053 // This function returns the last (largest) of the array of status values. 1054 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex]; 1055 int32_t tagVal = fData->fRuleStatusTable[idx]; 1056 1057 return tagVal; 1058 } 1059 1060 1061 int32_t RuleBasedBreakIterator::getRuleStatusVec( 1062 int32_t *fillInVec, int32_t capacity, UErrorCode &status) { 1063 if (U_FAILURE(status)) { 1064 return 0; 1065 } 1066 1067 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex]; 1068 int32_t numValsToCopy = numVals; 1069 if (numVals > capacity) { 1070 status = U_BUFFER_OVERFLOW_ERROR; 1071 numValsToCopy = capacity; 1072 } 1073 int i; 1074 for (i=0; i<numValsToCopy; i++) { 1075 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1]; 1076 } 1077 return numVals; 1078 } 1079 1080 1081 1082 //------------------------------------------------------------------------------- 1083 // 1084 // getBinaryRules Access to the compiled form of the rules, 1085 // for use by build system tools that save the data 1086 // for standard iterator types. 1087 // 1088 //------------------------------------------------------------------------------- 1089 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1090 const uint8_t *retPtr = nullptr; 1091 length = 0; 1092 1093 if (fData != nullptr) { 1094 retPtr = reinterpret_cast<const uint8_t*>(fData->fHeader); 1095 length = fData->fHeader->fLength; 1096 } 1097 return retPtr; 1098 } 1099 1100 1101 RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone( 1102 void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) { 1103 if (U_FAILURE(status)){ 1104 return nullptr; 1105 } 1106 1107 if (bufferSize == 0) { 1108 bufferSize = 1; // preflighting for deprecated functionality 1109 return nullptr; 1110 } 1111 1112 BreakIterator *clonedBI = clone(); 1113 if (clonedBI == nullptr) { 1114 status = U_MEMORY_ALLOCATION_ERROR; 1115 } else { 1116 status = U_SAFECLONE_ALLOCATED_WARNING; 1117 } 1118 return (RuleBasedBreakIterator *)clonedBI; 1119 } 1120 1121 U_NAMESPACE_END 1122 1123 1124 static icu::UStack *gLanguageBreakFactories = nullptr; 1125 static const icu::UnicodeString *gEmptyString = nullptr; 1126 static icu::UInitOnce gLanguageBreakFactoriesInitOnce {}; 1127 static icu::UInitOnce gRBBIInitOnce {}; 1128 static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr; 1129 1130 /** 1131 * Release all static memory held by breakiterator. 1132 */ 1133 U_CDECL_BEGIN 1134 UBool U_CALLCONV rbbi_cleanup() { 1135 delete gLanguageBreakFactories; 1136 gLanguageBreakFactories = nullptr; 1137 delete gEmptyString; 1138 gEmptyString = nullptr; 1139 gLanguageBreakFactoriesInitOnce.reset(); 1140 gRBBIInitOnce.reset(); 1141 return true; 1142 } 1143 U_CDECL_END 1144 1145 U_CDECL_BEGIN 1146 static void U_CALLCONV _deleteFactory(void *obj) { 1147 delete (icu::LanguageBreakFactory *) obj; 1148 } 1149 U_CDECL_END 1150 U_NAMESPACE_BEGIN 1151 1152 static void U_CALLCONV rbbiInit() { 1153 gEmptyString = new UnicodeString(); 1154 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); 1155 } 1156 1157 static void U_CALLCONV initLanguageFactories(UErrorCode& status) { 1158 U_ASSERT(gLanguageBreakFactories == nullptr); 1159 gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status); 1160 if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) { 1161 LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status); 1162 if (U_SUCCESS(status)) { 1163 gICULanguageBreakFactory = factory.orphan(); 1164 gLanguageBreakFactories->push(gICULanguageBreakFactory, status); 1165 #ifdef U_LOCAL_SERVICE_HOOK 1166 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1167 if (extra != nullptr) { 1168 gLanguageBreakFactories->push(extra, status); 1169 } 1170 #endif 1171 } 1172 } 1173 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); 1174 } 1175 1176 void ensureLanguageFactories(UErrorCode& status) { 1177 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status); 1178 } 1179 1180 static const LanguageBreakEngine* 1181 getLanguageBreakEngineFromFactory(UChar32 c, const char* locale) 1182 { 1183 UErrorCode status = U_ZERO_ERROR; 1184 ensureLanguageFactories(status); 1185 if (U_FAILURE(status)) return nullptr; 1186 1187 int32_t i = gLanguageBreakFactories->size(); 1188 const LanguageBreakEngine *lbe = nullptr; 1189 while (--i >= 0) { 1190 LanguageBreakFactory* factory = static_cast<LanguageBreakFactory*>(gLanguageBreakFactories->elementAt(i)); 1191 lbe = factory->getEngineFor(c, locale); 1192 if (lbe != nullptr) { 1193 break; 1194 } 1195 } 1196 return lbe; 1197 } 1198 1199 1200 //------------------------------------------------------------------------------- 1201 // 1202 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1203 // the character c. 1204 // 1205 //------------------------------------------------------------------------------- 1206 const LanguageBreakEngine * 1207 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) { 1208 const LanguageBreakEngine *lbe = nullptr; 1209 UErrorCode status = U_ZERO_ERROR; 1210 1211 if (fLanguageBreakEngines == nullptr) { 1212 fLanguageBreakEngines = new UStack(status); 1213 if (fLanguageBreakEngines == nullptr || U_FAILURE(status)) { 1214 delete fLanguageBreakEngines; 1215 fLanguageBreakEngines = nullptr; 1216 return nullptr; 1217 } 1218 } 1219 1220 int32_t i = fLanguageBreakEngines->size(); 1221 while (--i >= 0) { 1222 lbe = static_cast<const LanguageBreakEngine*>(fLanguageBreakEngines->elementAt(i)); 1223 if (lbe->handles(c, locale)) { 1224 return lbe; 1225 } 1226 } 1227 1228 // No existing dictionary took the character. See if a factory wants to 1229 // give us a new LanguageBreakEngine for this character. 1230 lbe = getLanguageBreakEngineFromFactory(c, locale); 1231 1232 // If we got one, use it and push it on our stack. 1233 if (lbe != nullptr) { 1234 fLanguageBreakEngines->push((void *)lbe, status); 1235 // Even if we can't remember it, we can keep looking it up, so 1236 // return it even if the push fails. 1237 return lbe; 1238 } 1239 1240 // No engine is forthcoming for this character. Add it to the 1241 // reject set. Create the reject break engine if needed. 1242 if (fUnhandledBreakEngine == nullptr) { 1243 fUnhandledBreakEngine = new UnhandledEngine(status); 1244 if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) { 1245 status = U_MEMORY_ALLOCATION_ERROR; 1246 return nullptr; 1247 } 1248 // Put it last so that scripts for which we have an engine get tried 1249 // first. 1250 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1251 // If we can't insert it, or creation failed, get rid of it 1252 U_ASSERT(!fLanguageBreakEngines->hasDeleter()); 1253 if (U_FAILURE(status)) { 1254 delete fUnhandledBreakEngine; 1255 fUnhandledBreakEngine = nullptr; 1256 return nullptr; 1257 } 1258 } 1259 1260 // Tell the reject engine about the character; at its discretion, it may 1261 // add more than just the one character. 1262 fUnhandledBreakEngine->handleCharacter(c); 1263 1264 return fUnhandledBreakEngine; 1265 } 1266 1267 #ifndef U_HIDE_DRAFT_API 1268 void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine( 1269 ExternalBreakEngine* toAdopt, UErrorCode& status) { 1270 LocalPointer<ExternalBreakEngine> engine(toAdopt, status); 1271 if (U_FAILURE(status)) return; 1272 ensureLanguageFactories(status); 1273 if (U_FAILURE(status)) return; 1274 gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status); 1275 } 1276 #endif /* U_HIDE_DRAFT_API */ 1277 1278 1279 void RuleBasedBreakIterator::dumpCache() { 1280 fBreakCache->dumpCache(); 1281 } 1282 1283 void RuleBasedBreakIterator::dumpTables() { 1284 fData->printData(); 1285 } 1286 1287 /** 1288 * Returns the description used to create this iterator 1289 */ 1290 1291 const UnicodeString& 1292 RuleBasedBreakIterator::getRules() const { 1293 if (fData != nullptr) { 1294 return fData->getRuleSourceString(); 1295 } else { 1296 umtx_initOnce(gRBBIInitOnce, &rbbiInit); 1297 return *gEmptyString; 1298 } 1299 } 1300 1301 U_NAMESPACE_END 1302 1303 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */