rbbi_cache.cpp (26912B)
1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // file: rbbi_cache.cpp 5 6 #include "unicode/utypes.h" 7 8 #if !UCONFIG_NO_BREAK_ITERATION 9 10 #include "unicode/ubrk.h" 11 #include "unicode/rbbi.h" 12 13 #include "rbbi_cache.h" 14 15 #include "brkeng.h" 16 #include "cmemory.h" 17 #include "rbbidata.h" 18 #include "rbbirb.h" 19 #include "uassert.h" 20 #include "uvectr32.h" 21 22 U_NAMESPACE_BEGIN 23 24 /* 25 * DictionaryCache implementation 26 */ 27 28 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 29 fBI(bi), fBreaks(status), fPositionInCache(-1), 30 fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { 31 } 32 33 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { 34 } 35 36 void RuleBasedBreakIterator::DictionaryCache::reset() { 37 fPositionInCache = -1; 38 fStart = 0; 39 fLimit = 0; 40 fFirstRuleStatusIndex = 0; 41 fOtherRuleStatusIndex = 0; 42 fBreaks.removeAllElements(); 43 } 44 45 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { 46 if (fromPos >= fLimit || fromPos < fStart) { 47 fPositionInCache = -1; 48 return false; 49 } 50 51 // Sequential iteration, move from previous boundary to the following 52 53 int32_t r = 0; 54 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { 55 ++fPositionInCache; 56 if (fPositionInCache >= fBreaks.size()) { 57 fPositionInCache = -1; 58 return false; 59 } 60 r = fBreaks.elementAti(fPositionInCache); 61 U_ASSERT(r > fromPos); 62 *result = r; 63 *statusIndex = fOtherRuleStatusIndex; 64 return true; 65 } 66 67 // Random indexing. Linear search for the boundary following the given position. 68 69 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 70 r= fBreaks.elementAti(fPositionInCache); 71 if (r > fromPos) { 72 *result = r; 73 *statusIndex = fOtherRuleStatusIndex; 74 return true; 75 } 76 } 77 UPRV_UNREACHABLE_EXIT; 78 } 79 80 81 UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { 82 if (fromPos <= fStart || fromPos > fLimit) { 83 fPositionInCache = -1; 84 return false; 85 } 86 87 if (fromPos == fLimit) { 88 fPositionInCache = fBreaks.size() - 1; 89 if (fPositionInCache >= 0) { 90 U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos); 91 } 92 } 93 94 int32_t r; 95 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { 96 --fPositionInCache; 97 r = fBreaks.elementAti(fPositionInCache); 98 U_ASSERT(r < fromPos); 99 *result = r; 100 *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 101 return true; 102 } 103 104 if (fPositionInCache == 0) { 105 fPositionInCache = -1; 106 return false; 107 } 108 109 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 110 r = fBreaks.elementAti(fPositionInCache); 111 if (r < fromPos) { 112 *result = r; 113 *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 114 return true; 115 } 116 } 117 UPRV_UNREACHABLE_EXIT; 118 } 119 120 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, 121 int32_t firstRuleStatus, int32_t otherRuleStatus) { 122 if ((endPos - startPos) <= 1) { 123 return; 124 } 125 126 reset(); 127 fFirstRuleStatusIndex = firstRuleStatus; 128 fOtherRuleStatusIndex = otherRuleStatus; 129 130 int32_t rangeStart = startPos; 131 int32_t rangeEnd = endPos; 132 133 uint16_t category; 134 int32_t current; 135 UErrorCode status = U_ZERO_ERROR; 136 int32_t foundBreakCount = 0; 137 UText *text = &fBI->fText; 138 139 // Loop through the text, looking for ranges of dictionary characters. 140 // For each span, find the appropriate break engine, and ask it to find 141 // any breaks within the span. 142 143 utext_setNativeIndex(text, rangeStart); 144 UChar32 c = utext_current32(text); 145 category = ucptrie_get(fBI->fData->fTrie, c); 146 uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart; 147 148 while(U_SUCCESS(status)) { 149 while ((current = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(text))) < rangeEnd 150 && (category < dictStart)) { 151 utext_next32(text); // TODO: cleaner loop structure. 152 c = utext_current32(text); 153 category = ucptrie_get(fBI->fData->fTrie, c); 154 } 155 if (current >= rangeEnd) { 156 break; 157 } 158 159 // We now have a dictionary character. Get the appropriate language object 160 // to deal with it. 161 const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine( 162 c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status)); 163 164 // Ask the language object if there are any breaks. It will add them to the cache and 165 // leave the text pointer on the other side of its range, ready to search for the next one. 166 if (lbe != nullptr) { 167 foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); 168 } 169 170 // Reload the loop variables for the next go-round 171 c = utext_current32(text); 172 category = ucptrie_get(fBI->fData->fTrie, c); 173 } 174 175 // If we found breaks, ensure that the first and last entries are 176 // the original starting and ending position. And initialize the 177 // cache iteration position to the first entry. 178 179 // printf("foundBreakCount = %d\n", foundBreakCount); 180 if (foundBreakCount > 0) { 181 U_ASSERT(foundBreakCount == fBreaks.size()); 182 if (startPos < fBreaks.elementAti(0)) { 183 // The dictionary did not place a boundary at the start of the segment of text. 184 // Add one now. This should not commonly happen, but it would be easy for interactions 185 // of the rules for dictionary segments and the break engine implementations to 186 // inadvertently cause it. Cover it here, just in case. 187 fBreaks.insertElementAt(startPos, 0, status); 188 } 189 if (endPos > fBreaks.peeki()) { 190 fBreaks.push(endPos, status); 191 } 192 fPositionInCache = 0; 193 // Note: Dictionary matching may extend beyond the original limit. 194 fStart = fBreaks.elementAti(0); 195 fLimit = fBreaks.peeki(); 196 } else { 197 // there were no language-based breaks, even though the segment contained 198 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 199 // for this range will fail, and the calling code will fall back to the rule based boundaries. 200 } 201 } 202 203 204 /* 205 * BreakCache implementation 206 */ 207 208 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 209 fBI(bi), fSideBuffer(status) { 210 reset(); 211 } 212 213 214 RuleBasedBreakIterator::BreakCache::~BreakCache() { 215 } 216 217 218 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { 219 fStartBufIdx = 0; 220 fEndBufIdx = 0; 221 fTextIdx = pos; 222 fBufIdx = 0; 223 fBoundaries[0] = pos; 224 fStatuses[0] = static_cast<uint16_t>(ruleStatus); 225 } 226 227 228 int32_t RuleBasedBreakIterator::BreakCache::current() { 229 fBI->fPosition = fTextIdx; 230 fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 231 fBI->fDone = false; 232 return fTextIdx; 233 } 234 235 236 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { 237 if (U_FAILURE(status)) { 238 return; 239 } 240 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { 241 // startPos is in the cache. Do a next() from that position. 242 // TODO: an awkward set of interactions with bi->fDone 243 // seek() does not clear it; it can't because of interactions with populateNear(). 244 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 245 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 246 fBI->fDone = false; 247 next(); 248 } 249 } 250 251 252 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { 253 if (U_FAILURE(status)) { 254 return; 255 } 256 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { 257 if (startPos == fTextIdx) { 258 previous(status); 259 } else { 260 // seek() leaves the BreakCache positioned at the preceding boundary 261 // if the requested position is between two boundaries. 262 // current() pushes the BreakCache position out to the BreakIterator itself. 263 U_ASSERT(startPos > fTextIdx); 264 current(); 265 } 266 } 267 } 268 269 270 /* 271 * Out-of-line code for BreakCache::next(). 272 * Cache does not already contain the boundary 273 */ 274 void RuleBasedBreakIterator::BreakCache::nextOL() { 275 fBI->fDone = !populateFollowing(); 276 fBI->fPosition = fTextIdx; 277 fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 278 } 279 280 281 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { 282 if (U_FAILURE(status)) { 283 return; 284 } 285 int32_t initialBufIdx = fBufIdx; 286 if (fBufIdx == fStartBufIdx) { 287 // At start of cache. Prepend to it. 288 populatePreceding(status); 289 } else { 290 // Cache already holds the next boundary 291 fBufIdx = modChunkSize(fBufIdx - 1); 292 fTextIdx = fBoundaries[fBufIdx]; 293 } 294 fBI->fDone = (fBufIdx == initialBufIdx); 295 fBI->fPosition = fTextIdx; 296 fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 297 } 298 299 300 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { 301 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 302 return false; 303 } 304 if (pos == fBoundaries[fStartBufIdx]) { 305 // Common case: seek(0), from BreakIterator::first() 306 fBufIdx = fStartBufIdx; 307 fTextIdx = fBoundaries[fBufIdx]; 308 return true; 309 } 310 if (pos == fBoundaries[fEndBufIdx]) { 311 fBufIdx = fEndBufIdx; 312 fTextIdx = fBoundaries[fBufIdx]; 313 return true; 314 } 315 316 int32_t min = fStartBufIdx; 317 int32_t max = fEndBufIdx; 318 while (min != max) { 319 int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 320 probe = modChunkSize(probe); 321 if (fBoundaries[probe] > pos) { 322 max = probe; 323 } else { 324 min = modChunkSize(probe + 1); 325 } 326 } 327 U_ASSERT(fBoundaries[max] > pos); 328 fBufIdx = modChunkSize(max - 1); 329 fTextIdx = fBoundaries[fBufIdx]; 330 U_ASSERT(fTextIdx <= pos); 331 return true; 332 } 333 334 335 UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { 336 if (U_FAILURE(status)) { 337 return false; 338 } 339 U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 340 341 // Add boundaries to the cache near the specified position. 342 // The given position need not be a boundary itself. 343 // The input position must be within the range of the text, and 344 // on a code point boundary. 345 // If the requested position is a break boundary, leave the iteration 346 // position on it. 347 // If the requested position is not a boundary, leave the iteration 348 // position on the preceding boundary and include both the 349 // preceding and following boundaries in the cache. 350 // Additional boundaries, either preceding or following, may be added 351 // to the cache as a side effect. 352 353 // If the requested position is not near already cached positions, clear the existing cache, 354 // find a near-by boundary and begin new cache contents there. 355 356 // Threshold for a text position to be considered near to existing cache contents. 357 // TODO: See issue ICU-22024 "perf tuning of Cache needed." 358 // This value is subject to change. See the ticket for more details. 359 static constexpr int32_t CACHE_NEAR = 15; 360 361 int32_t aBoundary = -1; 362 int32_t ruleStatusIndex = 0; 363 bool retainCache = false; 364 if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { 365 // Requested position is near the existing cache. Retain it. 366 retainCache = true; 367 } else if (position <= CACHE_NEAR) { 368 // Requested position is near the start of the text. Fill cache from start, skipping 369 // the need to find a safe point. 370 retainCache = false; 371 aBoundary = 0; 372 } else { 373 // Requested position is not near the existing cache. 374 // Find a safe point to refill the cache from. 375 int32_t backupPos = fBI->handleSafePrevious(position); 376 377 if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { 378 // The requested position is beyond the end of the existing cache, but the 379 // reverse rules produced a position near or before the cached region. 380 // Retain the existing cache, and fill from the end of it. 381 retainCache = true; 382 } else if (backupPos < CACHE_NEAR) { 383 // The safe reverse rules moved us to near the start of text. 384 // Take that (index 0) as the backup boundary, avoiding the complication 385 // (in the following block) of moving forward from the safe point to a known boundary. 386 // 387 // Retain the cache if it begins not too far from the requested position. 388 aBoundary = 0; 389 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); 390 } else { 391 // The safe reverse rules produced a position that is neither near the existing 392 // cache, nor near the start of text. 393 // Advance to the boundary following. 394 // There is a complication: the safe reverse rules identify pairs of code points 395 // that are safe. If advancing from the safe point moves forwards by less than 396 // two code points, we need to advance one more time to ensure that the boundary 397 // is good, including a correct rules status value. 398 retainCache = false; 399 fBI->fPosition = backupPos; 400 aBoundary = fBI->handleNext(); 401 if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) { 402 // +4 is a quick test for possibly having advanced only one codepoint. 403 // Four being the length of the longest potential code point, a supplementary in UTF-8 404 utext_setNativeIndex(&fBI->fText, aBoundary); 405 if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { 406 // The initial handleNext() only advanced by a single code point. Go again. 407 aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. 408 } 409 } 410 if (aBoundary == UBRK_DONE) { 411 // Note (Andy Heninger): I don't think this condition can occur, but it's hard 412 // to prove that it can't. We ran off the end of the string looking a boundary 413 // following a safe point; choose the end of the string as that boundary. 414 aBoundary = utext_nativeLength(&fBI->fText); 415 } 416 ruleStatusIndex = fBI->fRuleStatusIndex; 417 } 418 } 419 420 if (!retainCache) { 421 U_ASSERT(aBoundary != -1); 422 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 423 } 424 425 // Fill in boundaries between existing cache content and the new requested position. 426 427 if (fBoundaries[fEndBufIdx] < position) { 428 // The last position in the cache precedes the requested position. 429 // Add following position(s) to the cache. 430 while (fBoundaries[fEndBufIdx] < position) { 431 if (!populateFollowing()) { 432 UPRV_UNREACHABLE_EXIT; 433 } 434 } 435 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 436 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 437 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 438 previous(status); 439 } 440 return true; 441 } 442 443 if (fBoundaries[fStartBufIdx] > position) { 444 // The first position in the cache is beyond the requested position. 445 // back up more until we get a boundary <= the requested position. 446 while (fBoundaries[fStartBufIdx] > position) { 447 populatePreceding(status); 448 } 449 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 450 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 451 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 452 next(); 453 } 454 if (fTextIdx > position) { 455 // If position is not itself a boundary, the next() loop above will overshoot. 456 // Back up one, leaving cache position at the boundary preceding the requested position. 457 previous(status); 458 } 459 return true; 460 } 461 462 U_ASSERT(fTextIdx == position); 463 return true; 464 } 465 466 467 468 UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { 469 int32_t fromPosition = fBoundaries[fEndBufIdx]; 470 int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; 471 int32_t pos = 0; 472 int32_t ruleStatusIdx = 0; 473 474 if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { 475 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 476 return true; 477 } 478 479 fBI->fPosition = fromPosition; 480 pos = fBI->handleNext(); 481 if (pos == UBRK_DONE) { 482 return false; 483 } 484 485 ruleStatusIdx = fBI->fRuleStatusIndex; 486 if (fBI->fDictionaryCharCount > 0) { 487 // The text segment obtained from the rules includes dictionary characters. 488 // Subdivide it, with subdivided results going into the dictionary cache. 489 fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 490 if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { 491 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 492 return true; 493 // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. 494 // But be careful with interactions with populateNear(). 495 } 496 } 497 498 // Rule based segment did not include dictionary characters. 499 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 500 // meaning that we didn't take the return, above. 501 // Add its end point to the cache. 502 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 503 504 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 505 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 506 // 507 for (int count=0; count<6; ++count) { 508 pos = fBI->handleNext(); 509 if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { 510 break; 511 } 512 addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); 513 } 514 515 return true; 516 } 517 518 519 UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { 520 if (U_FAILURE(status)) { 521 return false; 522 } 523 524 int32_t fromPosition = fBoundaries[fStartBufIdx]; 525 if (fromPosition == 0) { 526 return false; 527 } 528 529 int32_t position = 0; 530 int32_t positionStatusIdx = 0; 531 532 if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { 533 addPreceding(position, positionStatusIdx, UpdateCachePosition); 534 return true; 535 } 536 537 int32_t backupPosition = fromPosition; 538 539 // Find a boundary somewhere preceding the first already-cached boundary 540 do { 541 backupPosition = backupPosition - 30; 542 if (backupPosition <= 0) { 543 backupPosition = 0; 544 } else { 545 backupPosition = fBI->handleSafePrevious(backupPosition); 546 } 547 if (backupPosition == UBRK_DONE || backupPosition == 0) { 548 position = 0; 549 positionStatusIdx = 0; 550 } else { 551 // Advance to the boundary following the backup position. 552 // There is a complication: the safe reverse rules identify pairs of code points 553 // that are safe. If advancing from the safe point moves forwards by less than 554 // two code points, we need to advance one more time to ensure that the boundary 555 // is good, including a correct rules status value. 556 // 557 fBI->fPosition = backupPosition; 558 position = fBI->handleNext(); 559 if (position <= backupPosition + 4) { 560 // +4 is a quick test for possibly having advanced only one codepoint. 561 // Four being the length of the longest potential code point, a supplementary in UTF-8 562 utext_setNativeIndex(&fBI->fText, position); 563 if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { 564 // The initial handleNext() only advanced by a single code point. Go again. 565 position = fBI->handleNext(); // Safe rules identify safe pairs. 566 } 567 } 568 positionStatusIdx = fBI->fRuleStatusIndex; 569 } 570 } while (position >= fromPosition); 571 572 // Find boundaries between the one we just located and the first already-cached boundary 573 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. 574 575 fSideBuffer.removeAllElements(); 576 fSideBuffer.addElement(position, status); 577 fSideBuffer.addElement(positionStatusIdx, status); 578 579 do { 580 int32_t prevPosition = fBI->fPosition = position; 581 int32_t prevStatusIdx = positionStatusIdx; 582 position = fBI->handleNext(); 583 positionStatusIdx = fBI->fRuleStatusIndex; 584 if (position == UBRK_DONE) { 585 break; 586 } 587 588 UBool segmentHandledByDictionary = false; 589 if (fBI->fDictionaryCharCount != 0) { 590 // Segment from the rules includes dictionary characters. 591 // Subdivide it, with subdivided results going into the dictionary cache. 592 int32_t dictSegEndPosition = position; 593 fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 594 while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { 595 segmentHandledByDictionary = true; 596 U_ASSERT(position > prevPosition); 597 if (position >= fromPosition) { 598 break; 599 } 600 U_ASSERT(position <= dictSegEndPosition); 601 fSideBuffer.addElement(position, status); 602 fSideBuffer.addElement(positionStatusIdx, status); 603 prevPosition = position; 604 } 605 U_ASSERT(position==dictSegEndPosition || position>=fromPosition); 606 } 607 608 if (!segmentHandledByDictionary && position < fromPosition) { 609 fSideBuffer.addElement(position, status); 610 fSideBuffer.addElement(positionStatusIdx, status); 611 } 612 } while (position < fromPosition); 613 614 // Move boundaries from the side buffer to the main circular buffer. 615 UBool success = false; 616 if (!fSideBuffer.isEmpty()) { 617 positionStatusIdx = fSideBuffer.popi(); 618 position = fSideBuffer.popi(); 619 addPreceding(position, positionStatusIdx, UpdateCachePosition); 620 success = true; 621 } 622 623 while (!fSideBuffer.isEmpty()) { 624 positionStatusIdx = fSideBuffer.popi(); 625 position = fSideBuffer.popi(); 626 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 627 // No space in circular buffer to hold a new preceding result while 628 // also retaining the current cache (iteration) position. 629 // Bailing out is safe; the cache will refill again if needed. 630 break; 631 } 632 } 633 634 return success; 635 } 636 637 638 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { 639 U_ASSERT(position > fBoundaries[fEndBufIdx]); 640 U_ASSERT(ruleStatusIdx <= UINT16_MAX); 641 int32_t nextIdx = modChunkSize(fEndBufIdx + 1); 642 if (nextIdx == fStartBufIdx) { 643 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 644 } 645 fBoundaries[nextIdx] = position; 646 fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); 647 fEndBufIdx = nextIdx; 648 if (update == UpdateCachePosition) { 649 // Set current position to the newly added boundary. 650 fBufIdx = nextIdx; 651 fTextIdx = position; 652 } else { 653 // Retaining the original cache position. 654 // Check if the added boundary wraps around the buffer, and would over-write the original position. 655 // It's the responsibility of callers of this function to not add too many. 656 U_ASSERT(nextIdx != fBufIdx); 657 } 658 } 659 660 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { 661 U_ASSERT(position < fBoundaries[fStartBufIdx]); 662 U_ASSERT(ruleStatusIdx <= UINT16_MAX); 663 int32_t nextIdx = modChunkSize(fStartBufIdx - 1); 664 if (nextIdx == fEndBufIdx) { 665 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 666 // Failure. The insertion of the new boundary would claim the buffer position that is the 667 // current iteration position. And we also want to retain the current iteration position. 668 // (The buffer is already completely full of entries that precede the iteration position.) 669 return false; 670 } 671 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 672 } 673 fBoundaries[nextIdx] = position; 674 fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx); 675 fStartBufIdx = nextIdx; 676 if (update == UpdateCachePosition) { 677 fBufIdx = nextIdx; 678 fTextIdx = position; 679 } 680 return true; 681 } 682 683 684 void RuleBasedBreakIterator::BreakCache::dumpCache() { 685 #ifdef RBBI_DEBUG 686 RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); 687 for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { 688 RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]); 689 if (i == fEndBufIdx) { 690 break; 691 } 692 } 693 #endif 694 } 695 696 U_NAMESPACE_END 697 698 #endif // #if !UCONFIG_NO_BREAK_ITERATION