tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbi.cpp (44554B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ***************************************************************************
      5 *   Copyright (C) 1999-2016 International Business Machines Corporation
      6 *   and others. All rights reserved.
      7 ***************************************************************************
      8 */
      9 //
     10 //  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
     11 //                   runtime engine and the API implementation for
     12 //                   class RuleBasedBreakIterator
     13 //
     14 
     15 #include "utypeinfo.h"  // for 'typeid' to work
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_BREAK_ITERATION
     20 
     21 #include <cinttypes>
     22 
     23 #include "unicode/rbbi.h"
     24 #include "unicode/schriter.h"
     25 #include "unicode/uchriter.h"
     26 #include "unicode/uclean.h"
     27 #include "unicode/udata.h"
     28 
     29 #include "brkeng.h"
     30 #include "ucln_cmn.h"
     31 #include "cmemory.h"
     32 #include "cstring.h"
     33 #include "localsvc.h"
     34 #include "rbbidata.h"
     35 #include "rbbi_cache.h"
     36 #include "rbbirb.h"
     37 #include "uassert.h"
     38 #include "umutex.h"
     39 #include "uvectr32.h"
     40 
     41 #ifdef RBBI_DEBUG
     42 static UBool gTrace = false;
     43 #endif
     44 
     45 U_NAMESPACE_BEGIN
     46 
     47 // The state number of the starting state
     48 constexpr int32_t START_STATE = 1;
     49 
     50 // The state-transition value indicating "stop"
     51 constexpr int32_t STOP_STATE = 0;
     52 
     53 
     54 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
     55 
     56 
     57 //=======================================================================
     58 // constructors
     59 //=======================================================================
     60 
     61 /**
     62 * Constructs a RuleBasedBreakIterator that uses the already-created
     63 * tables object that is passed in as a parameter.
     64 */
     65 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
     66 : RuleBasedBreakIterator(&status)
     67 {
     68    fData = new RBBIDataWrapper(data, status); // status checked in constructor
     69    if (U_FAILURE(status)) {return;}
     70    if(fData == nullptr) {
     71        status = U_MEMORY_ALLOCATION_ERROR;
     72        return;
     73    }
     74    if (fData->fForwardTable->fLookAheadResultsSize > 0) {
     75        fLookAheadMatches = static_cast<int32_t *>(
     76            uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
     77        if (fLookAheadMatches == nullptr) {
     78            status = U_MEMORY_ALLOCATION_ERROR;
     79            return;
     80        }
     81    }
     82 }
     83 
     84 //-------------------------------------------------------------------------------
     85 //
     86 //   Constructor   from a UDataMemory handle to precompiled break rules
     87 //                 stored in an ICU data file. This construcotr is private API,
     88 //                 only for internal use.
     89 //
     90 //-------------------------------------------------------------------------------
     91 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
     92        UErrorCode &status) : RuleBasedBreakIterator(udm, status)
     93 {
     94    fIsPhraseBreaking = isPhraseBreaking;
     95 }
     96 
     97 //
     98 //  Construct from precompiled binary rules (tables).  This constructor is public API,
     99 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
    100 //
    101 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
    102                       uint32_t       ruleLength,
    103                       UErrorCode     &status)
    104 : RuleBasedBreakIterator(&status)
    105 {
    106    if (U_FAILURE(status)) {
    107        return;
    108    }
    109    if (compiledRules == nullptr || ruleLength < sizeof(RBBIDataHeader)) {
    110        status = U_ILLEGAL_ARGUMENT_ERROR;
    111        return;
    112    }
    113    const RBBIDataHeader* data = reinterpret_cast<const RBBIDataHeader*>(compiledRules);
    114    if (data->fLength > ruleLength) {
    115        status = U_ILLEGAL_ARGUMENT_ERROR;
    116        return;
    117    }
    118    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
    119    if (U_FAILURE(status)) {return;}
    120    if(fData == nullptr) {
    121        status = U_MEMORY_ALLOCATION_ERROR;
    122        return;
    123    }
    124    if (fData->fForwardTable->fLookAheadResultsSize > 0) {
    125        fLookAheadMatches = static_cast<int32_t *>(
    126            uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
    127        if (fLookAheadMatches == nullptr) {
    128            status = U_MEMORY_ALLOCATION_ERROR;
    129            return;
    130        }
    131    }
    132 }
    133 
    134 
    135 //-------------------------------------------------------------------------------
    136 //
    137 //   Constructor   from a UDataMemory handle to precompiled break rules
    138 //                 stored in an ICU data file.
    139 //
    140 //-------------------------------------------------------------------------------
    141 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
    142 : RuleBasedBreakIterator(&status)
    143 {
    144    fData = new RBBIDataWrapper(udm, status); // status checked in constructor
    145    if (U_FAILURE(status)) {return;}
    146    if(fData == nullptr) {
    147        status = U_MEMORY_ALLOCATION_ERROR;
    148        return;
    149    }
    150    if (fData->fForwardTable->fLookAheadResultsSize > 0) {
    151        fLookAheadMatches = static_cast<int32_t *>(
    152            uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
    153        if (fLookAheadMatches == nullptr) {
    154            status = U_MEMORY_ALLOCATION_ERROR;
    155            return;
    156        }
    157    }
    158 }
    159 
    160 
    161 
    162 //-------------------------------------------------------------------------------
    163 //
    164 //   Constructor       from a set of rules supplied as a string.
    165 //
    166 //-------------------------------------------------------------------------------
    167 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
    168                                                UParseError          &parseError,
    169                                                UErrorCode           &status)
    170 : RuleBasedBreakIterator(&status)
    171 {
    172    if (U_FAILURE(status)) {return;}
    173    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
    174        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
    175    // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
    176    //        creates and returns a complete RBBI.  From here, in a constructor, we
    177    //        can't just return the object created by the builder factory, hence
    178    //        the assignment of the factory created object to "this".
    179    if (U_SUCCESS(status)) {
    180        *this = *bi;
    181        delete bi;
    182    }
    183 }
    184 
    185 
    186 //-------------------------------------------------------------------------------
    187 //
    188 // Default Constructor.      Create an empty shell that can be set up later.
    189 //                           Used when creating a RuleBasedBreakIterator from a set
    190 //                           of rules.
    191 //-------------------------------------------------------------------------------
    192 RuleBasedBreakIterator::RuleBasedBreakIterator()
    193 : RuleBasedBreakIterator(nullptr)
    194 {
    195 }
    196 
    197 /**
    198 * Simple Constructor with an error code.
    199 * Handles common initialization for all other constructors.
    200 */
    201 RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
    202    UErrorCode ec = U_ZERO_ERROR;
    203    if (status == nullptr) {
    204        status = &ec;
    205    }
    206    utext_openUChars(&fText, nullptr, 0, status);
    207    LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, *status), *status);
    208    LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, *status), *status);
    209    if (U_FAILURE(*status)) {
    210        fErrorCode = *status;
    211        return;
    212    }
    213    fDictionaryCache = lpDictionaryCache.orphan();
    214    fBreakCache = lpBreakCache.orphan();
    215 
    216 #ifdef RBBI_DEBUG
    217    static UBool debugInitDone = false;
    218    if (debugInitDone == false) {
    219        char *debugEnv = getenv("U_RBBIDEBUG");
    220        if (debugEnv && uprv_strstr(debugEnv, "trace")) {
    221            gTrace = true;
    222        }
    223        debugInitDone = true;
    224    }
    225 #endif
    226 }
    227 
    228 
    229 //-------------------------------------------------------------------------------
    230 //
    231 //   Copy constructor.  Will produce a break iterator with the same behavior,
    232 //                      and which iterates over the same text, as the one passed in.
    233 //
    234 //-------------------------------------------------------------------------------
    235 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
    236 : RuleBasedBreakIterator()
    237 {
    238    *this = other;
    239 }
    240 
    241 
    242 /**
    243 * Destructor
    244 */
    245 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    246    if (fCharIter != &fSCharIter) {
    247        // fCharIter was adopted from the outside.
    248        delete fCharIter;
    249    }
    250    fCharIter = nullptr;
    251 
    252    utext_close(&fText);
    253 
    254    if (fData != nullptr) {
    255        fData->removeReference();
    256        fData = nullptr;
    257    }
    258    delete fBreakCache;
    259    fBreakCache = nullptr;
    260 
    261    delete fDictionaryCache;
    262    fDictionaryCache = nullptr;
    263 
    264    delete fLanguageBreakEngines;
    265    fLanguageBreakEngines = nullptr;
    266 
    267    delete fUnhandledBreakEngine;
    268    fUnhandledBreakEngine = nullptr;
    269 
    270    uprv_free(fLookAheadMatches);
    271    fLookAheadMatches = nullptr;
    272 }
    273 
    274 /**
    275 * Assignment operator.  Sets this iterator to have the same behavior,
    276 * and iterate over the same text, as the one passed in.
    277 * TODO: needs better handling of memory allocation errors.
    278 */
    279 RuleBasedBreakIterator&
    280 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    281    if (this == &that) {
    282        return *this;
    283    }
    284    BreakIterator::operator=(that);
    285 
    286    if (fLanguageBreakEngines != nullptr) {
    287        delete fLanguageBreakEngines;
    288        fLanguageBreakEngines = nullptr;   // Just rebuild for now
    289    }
    290    // TODO: clone fLanguageBreakEngines from "that"
    291    UErrorCode status = U_ZERO_ERROR;
    292    utext_clone(&fText, &that.fText, false, true, &status);
    293 
    294    if (fCharIter != &fSCharIter) {
    295        delete fCharIter;
    296    }
    297    fCharIter = &fSCharIter;
    298 
    299    if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) {
    300        // This is a little bit tricky - it will initially appear that
    301        //  this->fCharIter is adopted, even if that->fCharIter was
    302        //  not adopted.  That's ok.
    303        fCharIter = that.fCharIter->clone();
    304    }
    305    fSCharIter = that.fSCharIter;
    306    if (fCharIter == nullptr) {
    307        fCharIter = &fSCharIter;
    308    }
    309 
    310    if (fData != nullptr) {
    311        fData->removeReference();
    312        fData = nullptr;
    313    }
    314    if (that.fData != nullptr) {
    315        fData = that.fData->addReference();
    316    }
    317 
    318    uprv_free(fLookAheadMatches);
    319    fLookAheadMatches = nullptr;
    320    if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) {
    321        fLookAheadMatches = static_cast<int32_t *>(
    322            uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
    323    }
    324 
    325 
    326    fPosition = that.fPosition;
    327    fRuleStatusIndex = that.fRuleStatusIndex;
    328    fDone = that.fDone;
    329 
    330    // TODO: both the dictionary and the main cache need to be copied.
    331    //       Current position could be within a dictionary range. Trying to continue
    332    //       the iteration without the caches present would go to the rules, with
    333    //       the assumption that the current position is on a rule boundary.
    334    fBreakCache->reset(fPosition, fRuleStatusIndex);
    335    fDictionaryCache->reset();
    336 
    337    return *this;
    338 }
    339 
    340 //-----------------------------------------------------------------------------
    341 //
    342 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
    343 //            behavior, and iterating over the same text, as this one.
    344 //            Virtual function: does the right thing with subclasses.
    345 //
    346 //-----------------------------------------------------------------------------
    347 RuleBasedBreakIterator*
    348 RuleBasedBreakIterator::clone() const {
    349    return new RuleBasedBreakIterator(*this);
    350 }
    351 
    352 /**
    353 * Equality operator.  Returns true if both BreakIterators are of the
    354 * same class, have the same behavior, and iterate over the same text.
    355 */
    356 bool
    357 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    358    if (typeid(*this) != typeid(that)) {
    359        return false;
    360    }
    361    if (this == &that) {
    362        return true;
    363    }
    364 
    365    // The base class BreakIterator carries no state that participates in equality,
    366    // and does not implement an equality function that would otherwise be
    367    // checked at this point.
    368 
    369    const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that);
    370 
    371    if (!utext_equals(&fText, &that2.fText)) {
    372        // The two break iterators are operating on different text,
    373        //   or have a different iteration position.
    374        //   Note that fText's position is always the same as the break iterator's position.
    375        return false;
    376    }
    377 
    378    if (!(fPosition == that2.fPosition &&
    379            fRuleStatusIndex == that2.fRuleStatusIndex &&
    380            fDone == that2.fDone)) {
    381        return false;
    382    }
    383 
    384    if (that2.fData == fData ||
    385        (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
    386            // The two break iterators are using the same rules.
    387            return true;
    388        }
    389    return false;
    390 }
    391 
    392 /**
    393 * Compute a hash code for this BreakIterator
    394 * @return A hash code
    395 */
    396 int32_t
    397 RuleBasedBreakIterator::hashCode() const {
    398    int32_t   hash = 0;
    399    if (fData != nullptr) {
    400        hash = fData->hashCode();
    401    }
    402    return hash;
    403 }
    404 
    405 
    406 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    407    if (U_FAILURE(status)) {
    408        return;
    409    }
    410    fBreakCache->reset();
    411    fDictionaryCache->reset();
    412    utext_clone(&fText, ut, false, true, &status);
    413 
    414    // Set up a dummy CharacterIterator to be returned if anyone
    415    //   calls getText().  With input from UText, there is no reasonable
    416    //   way to return a characterIterator over the actual input text.
    417    //   Return one over an empty string instead - this is the closest
    418    //   we can come to signaling a failure.
    419    //   (GetText() is obsolete, this failure is sort of OK)
    420    fSCharIter.setText(u"", 0);
    421 
    422    if (fCharIter != &fSCharIter) {
    423        // existing fCharIter was adopted from the outside.  Delete it now.
    424        delete fCharIter;
    425    }
    426    fCharIter = &fSCharIter;
    427 
    428    this->first();
    429 }
    430 
    431 
    432 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    433    UText *result = utext_clone(fillIn, &fText, false, true, &status);
    434    return result;
    435 }
    436 
    437 
    438 //=======================================================================
    439 // BreakIterator overrides
    440 //=======================================================================
    441 
    442 /**
    443 * Return a CharacterIterator over the text being analyzed.
    444 */
    445 CharacterIterator&
    446 RuleBasedBreakIterator::getText() const {
    447    return *fCharIter;
    448 }
    449 
    450 /**
    451 * Set the iterator to analyze a new piece of text.  This function resets
    452 * the current iteration position to the beginning of the text.
    453 * @param newText An iterator over the text to analyze.
    454 */
    455 void
    456 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    457    // If we are holding a CharacterIterator adopted from a
    458    //   previous call to this function, delete it now.
    459    if (fCharIter != &fSCharIter) {
    460        delete fCharIter;
    461    }
    462 
    463    fCharIter = newText;
    464    UErrorCode status = U_ZERO_ERROR;
    465    fBreakCache->reset();
    466    fDictionaryCache->reset();
    467    if (newText==nullptr || newText->startIndex() != 0) {
    468        // startIndex !=0 wants to be an error, but there's no way to report it.
    469        // Make the iterator text be an empty string.
    470        utext_openUChars(&fText, nullptr, 0, &status);
    471    } else {
    472        utext_openCharacterIterator(&fText, newText, &status);
    473    }
    474    this->first();
    475 }
    476 
    477 /**
    478 * Set the iterator to analyze a new piece of text.  This function resets
    479 * the current iteration position to the beginning of the text.
    480 * @param newText An iterator over the text to analyze.
    481 */
    482 void
    483 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    484    UErrorCode status = U_ZERO_ERROR;
    485    fBreakCache->reset();
    486    fDictionaryCache->reset();
    487    utext_openConstUnicodeString(&fText, &newText, &status);
    488 
    489    // Set up a character iterator on the string.
    490    //   Needed in case someone calls getText().
    491    //  Can not, unfortunately, do this lazily on the (probably never)
    492    //  call to getText(), because getText is const.
    493    fSCharIter.setText(newText.getBuffer(), newText.length());
    494 
    495    if (fCharIter != &fSCharIter) {
    496        // old fCharIter was adopted from the outside.  Delete it.
    497        delete fCharIter;
    498    }
    499    fCharIter = &fSCharIter;
    500 
    501    this->first();
    502 }
    503 
    504 
    505 /**
    506 *  Provide a new UText for the input text.  Must reference text with contents identical
    507 *  to the original.
    508 *  Intended for use with text data originating in Java (garbage collected) environments
    509 *  where the data may be moved in memory at arbitrary times.
    510 */
    511 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
    512    if (U_FAILURE(status)) {
    513        return *this;
    514    }
    515    if (input == nullptr) {
    516        status = U_ILLEGAL_ARGUMENT_ERROR;
    517        return *this;
    518    }
    519    int64_t pos = utext_getNativeIndex(&fText);
    520    //  Shallow read-only clone of the new UText into the existing input UText
    521    utext_clone(&fText, input, false, true, &status);
    522    if (U_FAILURE(status)) {
    523        return *this;
    524    }
    525    utext_setNativeIndex(&fText, pos);
    526    if (utext_getNativeIndex(&fText) != pos) {
    527        // Sanity check.  The new input utext is supposed to have the exact same
    528        // contents as the old.  If we can't set to the same position, it doesn't.
    529        // The contents underlying the old utext might be invalid at this point,
    530        // so it's not safe to check directly.
    531        status = U_ILLEGAL_ARGUMENT_ERROR;
    532    }
    533    return *this;
    534 }
    535 
    536 
    537 /**
    538 * Sets the current iteration position to the beginning of the text, position zero.
    539 * @return The new iterator position, which is zero.
    540 */
    541 int32_t RuleBasedBreakIterator::first() {
    542    UErrorCode status = U_ZERO_ERROR;
    543    if (!fBreakCache->seek(0)) {
    544        fBreakCache->populateNear(0, status);
    545    }
    546    fBreakCache->current();
    547    U_ASSERT(fPosition == 0);
    548    return 0;
    549 }
    550 
    551 /**
    552 * Sets the current iteration position to the end of the text.
    553 * @return The text's past-the-end offset.
    554 */
    555 int32_t RuleBasedBreakIterator::last() {
    556    int32_t endPos = static_cast<int32_t>(utext_nativeLength(&fText));
    557    UBool endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
    558    (void)endShouldBeBoundary;
    559    U_ASSERT(endShouldBeBoundary);
    560    U_ASSERT(fPosition == endPos);
    561    return endPos;
    562 }
    563 
    564 /**
    565 * Advances the iterator either forward or backward the specified number of steps.
    566 * Negative values move backward, and positive values move forward.  This is
    567 * equivalent to repeatedly calling next() or previous().
    568 * @param n The number of steps to move.  The sign indicates the direction
    569 * (negative is backwards, and positive is forwards).
    570 * @return The character offset of the boundary position n boundaries away from
    571 * the current one.
    572 */
    573 int32_t RuleBasedBreakIterator::next(int32_t n) {
    574    int32_t result = 0;
    575    if (n > 0) {
    576        for (; n > 0 && result != UBRK_DONE; --n) {
    577            result = next();
    578        }
    579    } else if (n < 0) {
    580        for (; n < 0 && result != UBRK_DONE; ++n) {
    581            result = previous();
    582        }
    583    } else {
    584        result = current();
    585    }
    586    return result;
    587 }
    588 
    589 /**
    590 * Advances the iterator to the next boundary position.
    591 * @return The position of the first boundary after this one.
    592 */
    593 int32_t RuleBasedBreakIterator::next() {
    594    fBreakCache->next();
    595    return fDone ? UBRK_DONE : fPosition;
    596 }
    597 
    598 /**
    599 * Move the iterator backwards, to the boundary preceding the current one.
    600 *
    601 *         Starts from the current position within fText.
    602 *         Starting position need not be on a boundary.
    603 *
    604 * @return The position of the boundary position immediately preceding the starting position.
    605 */
    606 int32_t RuleBasedBreakIterator::previous() {
    607    UErrorCode status = U_ZERO_ERROR;
    608    fBreakCache->previous(status);
    609    return fDone ? UBRK_DONE : fPosition;
    610 }
    611 
    612 /**
    613 * Sets the iterator to refer to the first boundary position following
    614 * the specified position.
    615 * @param startPos The position from which to begin searching for a break position.
    616 * @return The position of the first break after the current position.
    617 */
    618 int32_t RuleBasedBreakIterator::following(int32_t startPos) {
    619    // if the supplied position is before the beginning, return the
    620    // text's starting offset
    621    if (startPos < 0) {
    622        return first();
    623    }
    624 
    625    // Move requested offset to a code point start. It might be on a trail surrogate,
    626    // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
    627    utext_setNativeIndex(&fText, startPos);
    628    startPos = static_cast<int32_t>(utext_getNativeIndex(&fText));
    629 
    630    UErrorCode status = U_ZERO_ERROR;
    631    fBreakCache->following(startPos, status);
    632    return fDone ? UBRK_DONE : fPosition;
    633 }
    634 
    635 /**
    636 * Sets the iterator to refer to the last boundary position before the
    637 * specified position.
    638 * @param offset The position to begin searching for a break from.
    639 * @return The position of the last boundary before the starting position.
    640 */
    641 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    642    if (offset > utext_nativeLength(&fText)) {
    643        return last();
    644    }
    645 
    646    // Move requested offset to a code point start. It might be on a trail surrogate,
    647    // or on a trail byte if the input is UTF-8.
    648 
    649    utext_setNativeIndex(&fText, offset);
    650    int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
    651 
    652    UErrorCode status = U_ZERO_ERROR;
    653    fBreakCache->preceding(adjustedOffset, status);
    654    return fDone ? UBRK_DONE : fPosition;
    655 }
    656 
    657 /**
    658 * Returns true if the specified position is a boundary position.  As a side
    659 * effect, leaves the iterator pointing to the first boundary position at
    660 * or after "offset".
    661 *
    662 * @param offset the offset to check.
    663 * @return True if "offset" is a boundary position.
    664 */
    665 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    666    // out-of-range indexes are never boundary positions
    667    if (offset < 0) {
    668        first();       // For side effects on current position, tag values.
    669        return false;
    670    }
    671 
    672    // Adjust offset to be on a code point boundary and not beyond the end of the text.
    673    // Note that isBoundary() is always false for offsets that are not on code point boundaries.
    674    // But we still need the side effect of leaving iteration at the following boundary.
    675 
    676    utext_setNativeIndex(&fText, offset);
    677    int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
    678 
    679    bool result = false;
    680    UErrorCode status = U_ZERO_ERROR;
    681    if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
    682        result = (fBreakCache->current() == offset);
    683    }
    684 
    685    if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
    686        // Original offset is beyond the end of the text. Return false, it's not a boundary,
    687        // but the iteration position remains set to the end of the text, which is a boundary.
    688        return false;
    689    }
    690    if (!result) {
    691        // Not on a boundary. isBoundary() must leave iterator on the following boundary.
    692        // Cache->seek(), above, left us on the preceding boundary, so advance one.
    693        next();
    694    }
    695    return result;
    696 }
    697 
    698 
    699 /**
    700 * Returns the current iteration position.
    701 * @return The current iteration position.
    702 */
    703 int32_t RuleBasedBreakIterator::current() const {
    704    return fPosition;
    705 }
    706 
    707 
    708 //=======================================================================
    709 // implementation
    710 //=======================================================================
    711 
    712 //
    713 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
    714 //                 of user text.  A variable with this enum type keeps track of where we
    715 //                 are.  The state machine only fetches user input while in the RUN mode.
    716 //
    717 enum RBBIRunMode {
    718    RBBI_START,     // state machine processing is before first char of input
    719    RBBI_RUN,       // state machine processing is in the user text
    720    RBBI_END        // state machine processing is after end of user text.
    721 };
    722 
    723 
    724 // Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
    725 // instantiation, based on whether an 8 or 16 bit table is required.
    726 //
    727 // These Trie access functions will be inlined within the handleNext()/Previous() instantions.
    728 static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
    729    return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
    730 }
    731 
    732 static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
    733    return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
    734 }
    735 
    736 int32_t RuleBasedBreakIterator::handleNext() {
    737    const RBBIStateTable *statetable = fData->fForwardTable;
    738    bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
    739    if (statetable->fFlags & RBBI_8BITS_ROWS) {
    740        if (use8BitsTrie) {
    741            return handleNext<RBBIStateTableRow8, TrieFunc8>();
    742        } else {
    743            return handleNext<RBBIStateTableRow8, TrieFunc16>();
    744        }
    745    } else {
    746        if (use8BitsTrie) {
    747            return handleNext<RBBIStateTableRow16, TrieFunc8>();
    748        } else {
    749            return handleNext<RBBIStateTableRow16, TrieFunc16>();
    750        }
    751    }
    752 }
    753 
    754 int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
    755    const RBBIStateTable *statetable = fData->fReverseTable;
    756    bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
    757    if (statetable->fFlags & RBBI_8BITS_ROWS) {
    758        if (use8BitsTrie) {
    759            return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
    760        } else {
    761            return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
    762        }
    763    } else {
    764        if (use8BitsTrie) {
    765            return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
    766        } else {
    767            return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
    768        }
    769    }
    770 }
    771 
    772 
    773 //-----------------------------------------------------------------------------------
    774 //
    775 //  handleNext()
    776 //     Run the state machine to find a boundary
    777 //
    778 //-----------------------------------------------------------------------------------
    779 template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
    780 int32_t RuleBasedBreakIterator::handleNext() {
    781    int32_t             state;
    782    uint16_t            category        = 0;
    783    RBBIRunMode         mode;
    784 
    785    RowType             *row;
    786    UChar32             c;
    787    int32_t             result             = 0;
    788    int32_t             initialPosition    = 0;
    789    const RBBIStateTable *statetable       = fData->fForwardTable;
    790    const char         *tableData          = statetable->fTableData;
    791    uint32_t            tableRowLen        = statetable->fRowLen;
    792    uint32_t            dictStart          = statetable->fDictCategoriesStart;
    793    #ifdef RBBI_DEBUG
    794        if (gTrace) {
    795            RBBIDebugPuts("Handle Next   pos   char  state category");
    796        }
    797    #endif
    798 
    799    // handleNext always sets the break tag value.
    800    // Set the default for it.
    801    fRuleStatusIndex = 0;
    802 
    803    fDictionaryCharCount = 0;
    804 
    805    // if we're already at the end of the text, return DONE.
    806    initialPosition = fPosition;
    807    UTEXT_SETNATIVEINDEX(&fText, initialPosition);
    808    result          = initialPosition;
    809    c               = UTEXT_NEXT32(&fText);
    810    if (c==U_SENTINEL) {
    811        fDone = true;
    812        return UBRK_DONE;
    813    }
    814 
    815    //  Set the initial state for the state machine
    816    state = START_STATE;
    817    row = (RowType *)
    818            //(statetable->fTableData + (statetable->fRowLen * state));
    819            (tableData + tableRowLen * state);
    820 
    821 
    822    mode     = RBBI_RUN;
    823    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
    824        category = 2;
    825        mode     = RBBI_START;
    826    }
    827 
    828 
    829    // loop until we reach the end of the text or transition to state 0
    830    //
    831    for (;;) {
    832        if (c == U_SENTINEL) {
    833            // Reached end of input string.
    834            if (mode == RBBI_END) {
    835                // We have already run the loop one last time with the
    836                //   character set to the psueudo {eof} value.  Now it is time
    837                //   to unconditionally bail out.
    838                break;
    839            }
    840            // Run the loop one last time with the fake end-of-input character category.
    841            mode = RBBI_END;
    842            category = 1;
    843        }
    844 
    845        //
    846        // Get the char category.  An incoming category of 1 or 2 means that
    847        //      we are preset for doing the beginning or end of input, and
    848        //      that we shouldn't get a category from an actual text input character.
    849        //
    850        if (mode == RBBI_RUN) {
    851            // look up the current character's character category, which tells us
    852            // which column in the state table to look at.
    853            category = trieFunc(fData->fTrie, c);
    854            fDictionaryCharCount += (category >= dictStart);
    855        }
    856 
    857       #ifdef RBBI_DEBUG
    858            if (gTrace) {
    859                RBBIDebugPrintf("             %4" PRId64 "   ", utext_getNativeIndex(&fText));
    860                if (0x20<=c && c<0x7f) {
    861                    RBBIDebugPrintf("\"%c\"  ", c);
    862                } else {
    863                    RBBIDebugPrintf("%5x  ", c);
    864                }
    865                RBBIDebugPrintf("%3d  %3d\n", state, category);
    866            }
    867        #endif
    868 
    869        // State Transition - move machine to its next state
    870        //
    871 
    872        // fNextState is a variable-length array.
    873        U_ASSERT(category<fData->fHeader->fCatCount);
    874        state = row->fNextState[category];  /*Not accessing beyond memory*/
    875        row = (RowType *)
    876            // (statetable->fTableData + (statetable->fRowLen * state));
    877            (tableData + tableRowLen * state);
    878 
    879 
    880        uint16_t accepting = row->fAccepting;
    881        if (accepting == ACCEPTING_UNCONDITIONAL) {
    882            // Match found, common case.
    883            if (mode != RBBI_START) {
    884                result = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText));
    885            }
    886            fRuleStatusIndex = row->fTagsIdx;   // Remember the break status (tag) values.
    887        } else if (accepting > ACCEPTING_UNCONDITIONAL) {
    888            // Lookahead match is completed.
    889            U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
    890            int32_t lookaheadResult = fLookAheadMatches[accepting];
    891            if (lookaheadResult >= 0) {
    892                fRuleStatusIndex = row->fTagsIdx;
    893                fPosition = lookaheadResult;
    894                return lookaheadResult;
    895            }
    896        }
    897 
    898        // If we are at the position of the '/' in a look-ahead (hard break) rule;
    899        // record the current position, to be returned later, if the full rule matches.
    900        // TODO: Move this check before the previous check of fAccepting.
    901        //       This would enable hard-break rules with no following context.
    902        //       But there are line break test failures when trying this. Investigate.
    903        //       Issue ICU-20837
    904        uint16_t rule = row->fLookAhead;
    905        U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL);
    906        U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize);
    907        if (rule > ACCEPTING_UNCONDITIONAL) {
    908            int32_t pos = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText));
    909            fLookAheadMatches[rule] = pos;
    910        }
    911 
    912        if (state == STOP_STATE) {
    913            // This is the normal exit from the lookup state machine.
    914            // We have advanced through the string until it is certain that no
    915            //   longer match is possible, no matter what characters follow.
    916            break;
    917        }
    918 
    919        // Advance to the next character.
    920        // If this is a beginning-of-input loop iteration, don't advance
    921        //    the input position.  The next iteration will be processing the
    922        //    first real input character.
    923        if (mode == RBBI_RUN) {
    924            c = UTEXT_NEXT32(&fText);
    925        } else {
    926            if (mode == RBBI_START) {
    927                mode = RBBI_RUN;
    928            }
    929        }
    930    }
    931 
    932    // The state machine is done.  Check whether it found a match...
    933 
    934    // If the iterator failed to advance in the match engine, force it ahead by one.
    935    //   (This really indicates a defect in the break rules.  They should always match
    936    //    at least one character.)
    937    if (result == initialPosition) {
    938        utext_setNativeIndex(&fText, initialPosition);
    939        utext_next32(&fText);
    940        result = static_cast<int32_t>(utext_getNativeIndex(&fText));
    941        fRuleStatusIndex = 0;
    942    }
    943 
    944    // Leave the iterator at our result position.
    945    fPosition = result;
    946    #ifdef RBBI_DEBUG
    947        if (gTrace) {
    948            RBBIDebugPrintf("result = %d\n\n", result);
    949        }
    950    #endif
    951    return result;
    952 }
    953 
    954 
    955 //-----------------------------------------------------------------------------------
    956 //
    957 //  handleSafePrevious()
    958 //
    959 //      Iterate backwards using the safe reverse rules.
    960 //      The logic of this function is similar to handleNext(), but simpler
    961 //      because the safe table does not require as many options.
    962 //
    963 //-----------------------------------------------------------------------------------
    964 template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
    965 int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
    966 
    967    int32_t             state;
    968    uint16_t            category        = 0;
    969    RowType            *row;
    970    UChar32             c;
    971    int32_t             result          = 0;
    972 
    973    const RBBIStateTable *stateTable = fData->fReverseTable;
    974    UTEXT_SETNATIVEINDEX(&fText, fromPosition);
    975    #ifdef RBBI_DEBUG
    976        if (gTrace) {
    977            RBBIDebugPuts("Handle Previous   pos   char  state category");
    978        }
    979    #endif
    980 
    981    // if we're already at the start of the text, return DONE.
    982    if (fData == nullptr || UTEXT_GETNATIVEINDEX(&fText)==0) {
    983        return BreakIterator::DONE;
    984    }
    985 
    986    //  Set the initial state for the state machine
    987    c = UTEXT_PREVIOUS32(&fText);
    988    state = START_STATE;
    989    row = (RowType *)
    990            (stateTable->fTableData + (stateTable->fRowLen * state));
    991 
    992    // loop until we reach the start of the text or transition to state 0
    993    //
    994    for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
    995 
    996        // look up the current character's character category, which tells us
    997        // which column in the state table to look at.
    998        //
    999        //  Off the dictionary flag bit. For reverse iteration it is not used.
   1000        category = trieFunc(fData->fTrie, c);
   1001 
   1002        #ifdef RBBI_DEBUG
   1003            if (gTrace) {
   1004                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(&fText));
   1005                if (0x20<=c && c<0x7f) {
   1006                    RBBIDebugPrintf("\"%c\"  ", c);
   1007                } else {
   1008                    RBBIDebugPrintf("%5x  ", c);
   1009                }
   1010                RBBIDebugPrintf("%3d  %3d\n", state, category);
   1011            }
   1012        #endif
   1013 
   1014        // State Transition - move machine to its next state
   1015        //
   1016        // fNextState is a variable-length array.
   1017        U_ASSERT(category<fData->fHeader->fCatCount);
   1018        state = row->fNextState[category];  /*Not accessing beyond memory*/
   1019        row = (RowType *)
   1020            (stateTable->fTableData + (stateTable->fRowLen * state));
   1021 
   1022        if (state == STOP_STATE) {
   1023            // This is the normal exit from the lookup state machine.
   1024            // Transition to state zero means we have found a safe point.
   1025            break;
   1026        }
   1027    }
   1028 
   1029    // The state machine is done.  Check whether it found a match...
   1030    result = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText));
   1031    #ifdef RBBI_DEBUG
   1032        if (gTrace) {
   1033            RBBIDebugPrintf("result = %d\n\n", result);
   1034        }
   1035    #endif
   1036    return result;
   1037 }
   1038 
   1039 
   1040 //-------------------------------------------------------------------------------
   1041 //
   1042 //   getRuleStatus()   Return the break rule tag associated with the current
   1043 //                     iterator position.  If the iterator arrived at its current
   1044 //                     position by iterating forwards, the value will have been
   1045 //                     cached by the handleNext() function.
   1046 //
   1047 //-------------------------------------------------------------------------------
   1048 
   1049 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
   1050 
   1051    // fLastRuleStatusIndex indexes to the start of the appropriate status record
   1052    //                                                 (the number of status values.)
   1053    //   This function returns the last (largest) of the array of status values.
   1054    int32_t  idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
   1055    int32_t  tagVal = fData->fRuleStatusTable[idx];
   1056 
   1057    return tagVal;
   1058 }
   1059 
   1060 
   1061 int32_t RuleBasedBreakIterator::getRuleStatusVec(
   1062             int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
   1063    if (U_FAILURE(status)) {
   1064        return 0;
   1065    }
   1066 
   1067    int32_t  numVals = fData->fRuleStatusTable[fRuleStatusIndex];
   1068    int32_t  numValsToCopy = numVals;
   1069    if (numVals > capacity) {
   1070        status = U_BUFFER_OVERFLOW_ERROR;
   1071        numValsToCopy = capacity;
   1072    }
   1073    int i;
   1074    for (i=0; i<numValsToCopy; i++) {
   1075        fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
   1076    }
   1077    return numVals;
   1078 }
   1079 
   1080 
   1081 
   1082 //-------------------------------------------------------------------------------
   1083 //
   1084 //   getBinaryRules        Access to the compiled form of the rules,
   1085 //                         for use by build system tools that save the data
   1086 //                         for standard iterator types.
   1087 //
   1088 //-------------------------------------------------------------------------------
   1089 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
   1090    const uint8_t  *retPtr = nullptr;
   1091    length = 0;
   1092 
   1093    if (fData != nullptr) {
   1094        retPtr = reinterpret_cast<const uint8_t*>(fData->fHeader);
   1095        length = fData->fHeader->fLength;
   1096    }
   1097    return retPtr;
   1098 }
   1099 
   1100 
   1101 RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone(
   1102        void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) {
   1103    if (U_FAILURE(status)){
   1104        return nullptr;
   1105    }
   1106 
   1107    if (bufferSize == 0) {
   1108        bufferSize = 1;  // preflighting for deprecated functionality
   1109        return nullptr;
   1110    }
   1111 
   1112    BreakIterator *clonedBI = clone();
   1113    if (clonedBI == nullptr) {
   1114        status = U_MEMORY_ALLOCATION_ERROR;
   1115    } else {
   1116        status = U_SAFECLONE_ALLOCATED_WARNING;
   1117    }
   1118    return (RuleBasedBreakIterator *)clonedBI;
   1119 }
   1120 
   1121 U_NAMESPACE_END
   1122 
   1123 
   1124 static icu::UStack *gLanguageBreakFactories = nullptr;
   1125 static const icu::UnicodeString *gEmptyString = nullptr;
   1126 static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
   1127 static icu::UInitOnce gRBBIInitOnce {};
   1128 static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
   1129 
   1130 /**
   1131 * Release all static memory held by breakiterator.
   1132 */
   1133 U_CDECL_BEGIN
   1134 UBool U_CALLCONV rbbi_cleanup() {
   1135    delete gLanguageBreakFactories;
   1136    gLanguageBreakFactories = nullptr;
   1137    delete gEmptyString;
   1138    gEmptyString = nullptr;
   1139    gLanguageBreakFactoriesInitOnce.reset();
   1140    gRBBIInitOnce.reset();
   1141    return true;
   1142 }
   1143 U_CDECL_END
   1144 
   1145 U_CDECL_BEGIN
   1146 static void U_CALLCONV _deleteFactory(void *obj) {
   1147    delete (icu::LanguageBreakFactory *) obj;
   1148 }
   1149 U_CDECL_END
   1150 U_NAMESPACE_BEGIN
   1151 
   1152 static void U_CALLCONV rbbiInit() {
   1153    gEmptyString = new UnicodeString();
   1154    ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
   1155 }
   1156 
   1157 static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
   1158    U_ASSERT(gLanguageBreakFactories == nullptr);
   1159    gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
   1160    if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
   1161        LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status);
   1162        if (U_SUCCESS(status)) {
   1163            gICULanguageBreakFactory = factory.orphan();
   1164            gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
   1165 #ifdef U_LOCAL_SERVICE_HOOK
   1166            LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
   1167            if (extra != nullptr) {
   1168                gLanguageBreakFactories->push(extra, status);
   1169            }
   1170 #endif
   1171        }
   1172    }
   1173    ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
   1174 }
   1175 
   1176 void ensureLanguageFactories(UErrorCode& status) {
   1177    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
   1178 }
   1179 
   1180 static const LanguageBreakEngine*
   1181 getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
   1182 {
   1183    UErrorCode status = U_ZERO_ERROR;
   1184    ensureLanguageFactories(status);
   1185    if (U_FAILURE(status)) return nullptr;
   1186 
   1187    int32_t i = gLanguageBreakFactories->size();
   1188    const LanguageBreakEngine *lbe = nullptr;
   1189    while (--i >= 0) {
   1190        LanguageBreakFactory* factory = static_cast<LanguageBreakFactory*>(gLanguageBreakFactories->elementAt(i));
   1191        lbe = factory->getEngineFor(c, locale);
   1192        if (lbe != nullptr) {
   1193            break;
   1194        }
   1195    }
   1196    return lbe;
   1197 }
   1198 
   1199 
   1200 //-------------------------------------------------------------------------------
   1201 //
   1202 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
   1203 //                          the character c.
   1204 //
   1205 //-------------------------------------------------------------------------------
   1206 const LanguageBreakEngine *
   1207 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
   1208    const LanguageBreakEngine *lbe = nullptr;
   1209    UErrorCode status = U_ZERO_ERROR;
   1210 
   1211    if (fLanguageBreakEngines == nullptr) {
   1212        fLanguageBreakEngines = new UStack(status);
   1213        if (fLanguageBreakEngines == nullptr || U_FAILURE(status)) {
   1214            delete fLanguageBreakEngines;
   1215            fLanguageBreakEngines = nullptr;
   1216            return nullptr;
   1217        }
   1218    }
   1219 
   1220    int32_t i = fLanguageBreakEngines->size();
   1221    while (--i >= 0) {
   1222        lbe = static_cast<const LanguageBreakEngine*>(fLanguageBreakEngines->elementAt(i));
   1223        if (lbe->handles(c, locale)) {
   1224            return lbe;
   1225        }
   1226    }
   1227 
   1228    // No existing dictionary took the character. See if a factory wants to
   1229    // give us a new LanguageBreakEngine for this character.
   1230    lbe = getLanguageBreakEngineFromFactory(c, locale);
   1231 
   1232    // If we got one, use it and push it on our stack.
   1233    if (lbe != nullptr) {
   1234        fLanguageBreakEngines->push((void *)lbe, status);
   1235        // Even if we can't remember it, we can keep looking it up, so
   1236        // return it even if the push fails.
   1237        return lbe;
   1238    }
   1239 
   1240    // No engine is forthcoming for this character. Add it to the
   1241    // reject set. Create the reject break engine if needed.
   1242    if (fUnhandledBreakEngine == nullptr) {
   1243        fUnhandledBreakEngine = new UnhandledEngine(status);
   1244        if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) {
   1245            status = U_MEMORY_ALLOCATION_ERROR;
   1246            return nullptr;
   1247        }
   1248        // Put it last so that scripts for which we have an engine get tried
   1249        // first.
   1250        fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
   1251        // If we can't insert it, or creation failed, get rid of it
   1252        U_ASSERT(!fLanguageBreakEngines->hasDeleter());
   1253        if (U_FAILURE(status)) {
   1254            delete fUnhandledBreakEngine;
   1255            fUnhandledBreakEngine = nullptr;
   1256            return nullptr;
   1257        }
   1258    }
   1259 
   1260    // Tell the reject engine about the character; at its discretion, it may
   1261    // add more than just the one character.
   1262    fUnhandledBreakEngine->handleCharacter(c);
   1263 
   1264    return fUnhandledBreakEngine;
   1265 }
   1266 
   1267 #ifndef U_HIDE_DRAFT_API
   1268 void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
   1269                  ExternalBreakEngine* toAdopt, UErrorCode& status) {
   1270    LocalPointer<ExternalBreakEngine> engine(toAdopt, status);
   1271    if (U_FAILURE(status)) return;
   1272    ensureLanguageFactories(status);
   1273    if (U_FAILURE(status)) return;
   1274    gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
   1275 }
   1276 #endif  /* U_HIDE_DRAFT_API */
   1277 
   1278 
   1279 void RuleBasedBreakIterator::dumpCache() {
   1280    fBreakCache->dumpCache();
   1281 }
   1282 
   1283 void RuleBasedBreakIterator::dumpTables() {
   1284    fData->printData();
   1285 }
   1286 
   1287 /**
   1288 * Returns the description used to create this iterator
   1289 */
   1290 
   1291 const UnicodeString&
   1292 RuleBasedBreakIterator::getRules() const {
   1293    if (fData != nullptr) {
   1294        return fData->getRuleSourceString();
   1295    } else {
   1296        umtx_initOnce(gRBBIInitOnce, &rbbiInit);
   1297        return *gEmptyString;
   1298    }
   1299 }
   1300 
   1301 U_NAMESPACE_END
   1302 
   1303 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */