edits.cpp (29014B)
1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // edits.cpp 5 // created: 2017feb08 Markus W. Scherer 6 7 #include "unicode/edits.h" 8 #include "unicode/unistr.h" 9 #include "unicode/utypes.h" 10 #include "cmemory.h" 11 #include "uassert.h" 12 #include "util.h" 13 14 U_NAMESPACE_BEGIN 15 16 namespace { 17 18 // 0000uuuuuuuuuuuu records u+1 unchanged text units. 19 const int32_t MAX_UNCHANGED_LENGTH = 0x1000; 20 const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; 21 22 // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units. 23 const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6; 24 const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7; 25 const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff; 26 const int32_t MAX_SHORT_CHANGE = 0x6fff; 27 28 // 0111mmmmmmnnnnnn records a replacement of m text units with n. 29 // m or n = 61: actual length follows in the next edits array unit. 30 // m or n = 62..63: actual length follows in the next two edits array units. 31 // Bit 30 of the actual length is in the head unit. 32 // Trailing units have bit 15 set. 33 const int32_t LENGTH_IN_1TRAIL = 61; 34 const int32_t LENGTH_IN_2TRAIL = 62; 35 36 } // namespace 37 38 void Edits::releaseArray() noexcept { 39 if (array != stackArray) { 40 uprv_free(array); 41 } 42 } 43 44 Edits &Edits::copyArray(const Edits &other) { 45 if (U_FAILURE(errorCode_)) { 46 length = delta = numChanges = 0; 47 return *this; 48 } 49 if (length > capacity) { 50 uint16_t* newArray = static_cast<uint16_t*>(uprv_malloc(static_cast<size_t>(length) * 2)); 51 if (newArray == nullptr) { 52 length = delta = numChanges = 0; 53 errorCode_ = U_MEMORY_ALLOCATION_ERROR; 54 return *this; 55 } 56 releaseArray(); 57 array = newArray; 58 capacity = length; 59 } 60 if (length > 0) { 61 uprv_memcpy(array, other.array, (size_t)length * 2); 62 } 63 return *this; 64 } 65 66 Edits &Edits::moveArray(Edits &src) noexcept { 67 if (U_FAILURE(errorCode_)) { 68 length = delta = numChanges = 0; 69 return *this; 70 } 71 releaseArray(); 72 if (length > STACK_CAPACITY) { 73 array = src.array; 74 capacity = src.capacity; 75 src.array = src.stackArray; 76 src.capacity = STACK_CAPACITY; 77 src.reset(); 78 return *this; 79 } 80 array = stackArray; 81 capacity = STACK_CAPACITY; 82 if (length > 0) { 83 uprv_memcpy(array, src.array, (size_t)length * 2); 84 } 85 return *this; 86 } 87 88 Edits &Edits::operator=(const Edits &other) { 89 if (this == &other) { return *this; } // self-assignment: no-op 90 length = other.length; 91 delta = other.delta; 92 numChanges = other.numChanges; 93 errorCode_ = other.errorCode_; 94 return copyArray(other); 95 } 96 97 Edits &Edits::operator=(Edits &&src) noexcept { 98 length = src.length; 99 delta = src.delta; 100 numChanges = src.numChanges; 101 errorCode_ = src.errorCode_; 102 return moveArray(src); 103 } 104 105 Edits::~Edits() { 106 releaseArray(); 107 } 108 109 void Edits::reset() noexcept { 110 length = delta = numChanges = 0; 111 errorCode_ = U_ZERO_ERROR; 112 } 113 114 void Edits::addUnchanged(int32_t unchangedLength) { 115 if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; } 116 if(unchangedLength < 0) { 117 errorCode_ = U_ILLEGAL_ARGUMENT_ERROR; 118 return; 119 } 120 // Merge into previous unchanged-text record, if any. 121 int32_t last = lastUnit(); 122 if(last < MAX_UNCHANGED) { 123 int32_t remaining = MAX_UNCHANGED - last; 124 if (remaining >= unchangedLength) { 125 setLastUnit(last + unchangedLength); 126 return; 127 } 128 setLastUnit(MAX_UNCHANGED); 129 unchangedLength -= remaining; 130 } 131 // Split large lengths into multiple units. 132 while(unchangedLength >= MAX_UNCHANGED_LENGTH) { 133 append(MAX_UNCHANGED); 134 unchangedLength -= MAX_UNCHANGED_LENGTH; 135 } 136 // Write a small (remaining) length. 137 if(unchangedLength > 0) { 138 append(unchangedLength - 1); 139 } 140 } 141 142 void Edits::addReplace(int32_t oldLength, int32_t newLength) { 143 if(U_FAILURE(errorCode_)) { return; } 144 if(oldLength < 0 || newLength < 0) { 145 errorCode_ = U_ILLEGAL_ARGUMENT_ERROR; 146 return; 147 } 148 if (oldLength == 0 && newLength == 0) { 149 return; 150 } 151 ++numChanges; 152 int32_t newDelta = newLength - oldLength; 153 if (newDelta != 0) { 154 if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) || 155 (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) { 156 // Integer overflow or underflow. 157 errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR; 158 return; 159 } 160 delta += newDelta; 161 } 162 163 if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH && 164 newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { 165 // Merge into previous same-lengths short-replacement record, if any. 166 int32_t u = (oldLength << 12) | (newLength << 9); 167 int32_t last = lastUnit(); 168 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && 169 (last & ~SHORT_CHANGE_NUM_MASK) == u && 170 (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { 171 setLastUnit(last + 1); 172 return; 173 } 174 append(u); 175 return; 176 } 177 178 int32_t head = 0x7000; 179 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { 180 head |= oldLength << 6; 181 head |= newLength; 182 append(head); 183 } else if ((capacity - length) >= 5 || growArray()) { 184 int32_t limit = length + 1; 185 if(oldLength < LENGTH_IN_1TRAIL) { 186 head |= oldLength << 6; 187 } else if(oldLength <= 0x7fff) { 188 head |= LENGTH_IN_1TRAIL << 6; 189 array[limit++] = static_cast<uint16_t>(0x8000 | oldLength); 190 } else { 191 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; 192 array[limit++] = static_cast<uint16_t>(0x8000 | (oldLength >> 15)); 193 array[limit++] = static_cast<uint16_t>(0x8000 | oldLength); 194 } 195 if(newLength < LENGTH_IN_1TRAIL) { 196 head |= newLength; 197 } else if(newLength <= 0x7fff) { 198 head |= LENGTH_IN_1TRAIL; 199 array[limit++] = static_cast<uint16_t>(0x8000 | newLength); 200 } else { 201 head |= LENGTH_IN_2TRAIL + (newLength >> 30); 202 array[limit++] = static_cast<uint16_t>(0x8000 | (newLength >> 15)); 203 array[limit++] = static_cast<uint16_t>(0x8000 | newLength); 204 } 205 array[length] = static_cast<uint16_t>(head); 206 length = limit; 207 } 208 } 209 210 void Edits::append(int32_t r) { 211 if(length < capacity || growArray()) { 212 array[length++] = static_cast<uint16_t>(r); 213 } 214 } 215 216 UBool Edits::growArray() { 217 int32_t newCapacity; 218 if (array == stackArray) { 219 newCapacity = 2000; 220 } else if (capacity == INT32_MAX) { 221 // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API 222 // with a result-string-buffer overflow. 223 errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR; 224 return false; 225 } else if (capacity >= (INT32_MAX / 2)) { 226 newCapacity = INT32_MAX; 227 } else { 228 newCapacity = 2 * capacity; 229 } 230 // Grow by at least 5 units so that a maximal change record will fit. 231 if ((newCapacity - capacity) < 5) { 232 errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR; 233 return false; 234 } 235 uint16_t* newArray = static_cast<uint16_t*>(uprv_malloc(static_cast<size_t>(newCapacity) * 2)); 236 if (newArray == nullptr) { 237 errorCode_ = U_MEMORY_ALLOCATION_ERROR; 238 return false; 239 } 240 uprv_memcpy(newArray, array, (size_t)length * 2); 241 releaseArray(); 242 array = newArray; 243 capacity = newCapacity; 244 return true; 245 } 246 247 UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const { 248 if (U_FAILURE(outErrorCode)) { return true; } 249 if (U_SUCCESS(errorCode_)) { return false; } 250 outErrorCode = errorCode_; 251 return true; 252 } 253 254 Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) { 255 if (copyErrorTo(errorCode)) { return *this; } 256 // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c. 257 // Parallel iteration over both Edits. 258 Iterator abIter = ab.getFineIterator(); 259 Iterator bcIter = bc.getFineIterator(); 260 UBool abHasNext = true, bcHasNext = true; 261 // Copy iterator state into local variables, so that we can modify and subdivide spans. 262 // ab old & new length, bc old & new length 263 int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0; 264 // When we have different-intermediate-length changes, we accumulate a larger change. 265 int32_t pending_aLength = 0, pending_cLength = 0; 266 for (;;) { 267 // At this point, for each of the two iterators: 268 // Either we are done with the locally cached current edit, 269 // and its intermediate-string length has been reset, 270 // or we will continue to work with a truncated remainder of this edit. 271 // 272 // If the current edit is done, and the iterator has not yet reached the end, 273 // then we fetch the next edit. This is true for at least one of the iterators. 274 // 275 // Normally it does not matter whether we fetch from ab and then bc or vice versa. 276 // However, the result is observably different when 277 // ab deletions meet bc insertions at the same intermediate-string index. 278 // Some users expect the bc insertions to come first, so we fetch from bc first. 279 if (bc_bLength == 0) { 280 if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) { 281 bc_bLength = bcIter.oldLength(); 282 cLength = bcIter.newLength(); 283 if (bc_bLength == 0) { 284 // insertion 285 if (ab_bLength == 0 || !abIter.hasChange()) { 286 addReplace(pending_aLength, pending_cLength + cLength); 287 pending_aLength = pending_cLength = 0; 288 } else { 289 pending_cLength += cLength; 290 } 291 continue; 292 } 293 } 294 // else see if the other iterator is done, too. 295 } 296 if (ab_bLength == 0) { 297 if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) { 298 aLength = abIter.oldLength(); 299 ab_bLength = abIter.newLength(); 300 if (ab_bLength == 0) { 301 // deletion 302 if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { 303 addReplace(pending_aLength + aLength, pending_cLength); 304 pending_aLength = pending_cLength = 0; 305 } else { 306 pending_aLength += aLength; 307 } 308 continue; 309 } 310 } else if (bc_bLength == 0) { 311 // Both iterators are done at the same time: 312 // The intermediate-string lengths match. 313 break; 314 } else { 315 // The ab output string is shorter than the bc input string. 316 if (!copyErrorTo(errorCode)) { 317 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 318 } 319 return *this; 320 } 321 } 322 if (bc_bLength == 0) { 323 // The bc input string is shorter than the ab output string. 324 if (!copyErrorTo(errorCode)) { 325 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 326 } 327 return *this; 328 } 329 // Done fetching: ab_bLength > 0 && bc_bLength > 0 330 331 // The current state has two parts: 332 // - Past: We accumulate a longer ac edit in the "pending" variables. 333 // - Current: We have copies of the current ab/bc edits in local variables. 334 // At least one side is newly fetched. 335 // One side might be a truncated remainder of an edit we fetched earlier. 336 337 if (!abIter.hasChange() && !bcIter.hasChange()) { 338 // An unchanged span all the way from string a to string c. 339 if (pending_aLength != 0 || pending_cLength != 0) { 340 addReplace(pending_aLength, pending_cLength); 341 pending_aLength = pending_cLength = 0; 342 } 343 int32_t unchangedLength = aLength <= cLength ? aLength : cLength; 344 addUnchanged(unchangedLength); 345 ab_bLength = aLength -= unchangedLength; 346 bc_bLength = cLength -= unchangedLength; 347 // At least one of the unchanged spans is now empty. 348 continue; 349 } 350 if (!abIter.hasChange() && bcIter.hasChange()) { 351 // Unchanged a->b but changed b->c. 352 if (ab_bLength >= bc_bLength) { 353 // Split the longer unchanged span into change + remainder. 354 addReplace(pending_aLength + bc_bLength, pending_cLength + cLength); 355 pending_aLength = pending_cLength = 0; 356 aLength = ab_bLength -= bc_bLength; 357 bc_bLength = 0; 358 continue; 359 } 360 // Handle the shorter unchanged span below like a change. 361 } else if (abIter.hasChange() && !bcIter.hasChange()) { 362 // Changed a->b and then unchanged b->c. 363 if (ab_bLength <= bc_bLength) { 364 // Split the longer unchanged span into change + remainder. 365 addReplace(pending_aLength + aLength, pending_cLength + ab_bLength); 366 pending_aLength = pending_cLength = 0; 367 cLength = bc_bLength -= ab_bLength; 368 ab_bLength = 0; 369 continue; 370 } 371 // Handle the shorter unchanged span below like a change. 372 } else { // both abIter.hasChange() && bcIter.hasChange() 373 if (ab_bLength == bc_bLength) { 374 // Changes on both sides up to the same position. Emit & reset. 375 addReplace(pending_aLength + aLength, pending_cLength + cLength); 376 pending_aLength = pending_cLength = 0; 377 ab_bLength = bc_bLength = 0; 378 continue; 379 } 380 } 381 // Accumulate the a->c change, reset the shorter side, 382 // keep a remainder of the longer one. 383 pending_aLength += aLength; 384 pending_cLength += cLength; 385 if (ab_bLength < bc_bLength) { 386 bc_bLength -= ab_bLength; 387 cLength = ab_bLength = 0; 388 } else { // ab_bLength > bc_bLength 389 ab_bLength -= bc_bLength; 390 aLength = bc_bLength = 0; 391 } 392 } 393 if (pending_aLength != 0 || pending_cLength != 0) { 394 addReplace(pending_aLength, pending_cLength); 395 } 396 copyErrorTo(errorCode); 397 return *this; 398 } 399 400 Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : 401 array(a), index(0), length(len), remaining(0), 402 onlyChanges_(oc), coarse(crs), 403 dir(0), changed(false), oldLength_(0), newLength_(0), 404 srcIndex(0), replIndex(0), destIndex(0) {} 405 406 int32_t Edits::Iterator::readLength(int32_t head) { 407 if (head < LENGTH_IN_1TRAIL) { 408 return head; 409 } else if (head < LENGTH_IN_2TRAIL) { 410 U_ASSERT(index < length); 411 U_ASSERT(array[index] >= 0x8000); 412 return array[index++] & 0x7fff; 413 } else { 414 U_ASSERT((index + 2) <= length); 415 U_ASSERT(array[index] >= 0x8000); 416 U_ASSERT(array[index + 1] >= 0x8000); 417 int32_t len = ((head & 1) << 30) | 418 (static_cast<int32_t>(array[index] & 0x7fff) << 15) | 419 (array[index + 1] & 0x7fff); 420 index += 2; 421 return len; 422 } 423 } 424 425 void Edits::Iterator::updateNextIndexes() { 426 srcIndex += oldLength_; 427 if (changed) { 428 replIndex += newLength_; 429 } 430 destIndex += newLength_; 431 } 432 433 void Edits::Iterator::updatePreviousIndexes() { 434 srcIndex -= oldLength_; 435 if (changed) { 436 replIndex -= newLength_; 437 } 438 destIndex -= newLength_; 439 } 440 441 UBool Edits::Iterator::noNext() { 442 // No change before or beyond the string. 443 dir = 0; 444 changed = false; 445 oldLength_ = newLength_ = 0; 446 return false; 447 } 448 449 UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { 450 // Forward iteration: Update the string indexes to the limit of the current span, 451 // and post-increment-read array units to assemble a new span. 452 // Leaves the array index one after the last unit of that span. 453 if (U_FAILURE(errorCode)) { return false; } 454 // We have an errorCode in case we need to start guarding against integer overflows. 455 // It is also convenient for caller loops if we bail out when an error was set elsewhere. 456 if (dir > 0) { 457 updateNextIndexes(); 458 } else { 459 if (dir < 0) { 460 // Turn around from previous() to next(). 461 // Post-increment-read the same span again. 462 if (remaining > 0) { 463 // Fine-grained iterator: 464 // Stay on the current one of a sequence of compressed changes. 465 ++index; // next() rests on the index after the sequence unit. 466 dir = 1; 467 return true; 468 } 469 } 470 dir = 1; 471 } 472 if (remaining >= 1) { 473 // Fine-grained iterator: Continue a sequence of compressed changes. 474 if (remaining > 1) { 475 --remaining; 476 return true; 477 } 478 remaining = 0; 479 } 480 if (index >= length) { 481 return noNext(); 482 } 483 int32_t u = array[index++]; 484 if (u <= MAX_UNCHANGED) { 485 // Combine adjacent unchanged ranges. 486 changed = false; 487 oldLength_ = u + 1; 488 while (index < length && (u = array[index]) <= MAX_UNCHANGED) { 489 ++index; 490 oldLength_ += u + 1; 491 } 492 newLength_ = oldLength_; 493 if (onlyChanges) { 494 updateNextIndexes(); 495 if (index >= length) { 496 return noNext(); 497 } 498 // already fetched u > MAX_UNCHANGED at index 499 ++index; 500 } else { 501 return true; 502 } 503 } 504 changed = true; 505 if (u <= MAX_SHORT_CHANGE) { 506 int32_t oldLen = u >> 12; 507 int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 508 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; 509 if (coarse) { 510 oldLength_ = num * oldLen; 511 newLength_ = num * newLen; 512 } else { 513 // Split a sequence of changes that was compressed into one unit. 514 oldLength_ = oldLen; 515 newLength_ = newLen; 516 if (num > 1) { 517 remaining = num; // This is the first of two or more changes. 518 } 519 return true; 520 } 521 } else { 522 U_ASSERT(u <= 0x7fff); 523 oldLength_ = readLength((u >> 6) & 0x3f); 524 newLength_ = readLength(u & 0x3f); 525 if (!coarse) { 526 return true; 527 } 528 } 529 // Combine adjacent changes. 530 while (index < length && (u = array[index]) > MAX_UNCHANGED) { 531 ++index; 532 if (u <= MAX_SHORT_CHANGE) { 533 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; 534 oldLength_ += (u >> 12) * num; 535 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 536 } else { 537 U_ASSERT(u <= 0x7fff); 538 oldLength_ += readLength((u >> 6) & 0x3f); 539 newLength_ += readLength(u & 0x3f); 540 } 541 } 542 return true; 543 } 544 545 UBool Edits::Iterator::previous(UErrorCode &errorCode) { 546 // Backward iteration: Pre-decrement-read array units to assemble a new span, 547 // then update the string indexes to the start of that span. 548 // Leaves the array index on the head unit of that span. 549 if (U_FAILURE(errorCode)) { return false; } 550 // We have an errorCode in case we need to start guarding against integer overflows. 551 // It is also convenient for caller loops if we bail out when an error was set elsewhere. 552 if (dir >= 0) { 553 if (dir > 0) { 554 // Turn around from next() to previous(). 555 // Set the string indexes to the span limit and 556 // pre-decrement-read the same span again. 557 if (remaining > 0) { 558 // Fine-grained iterator: 559 // Stay on the current one of a sequence of compressed changes. 560 --index; // previous() rests on the sequence unit. 561 dir = -1; 562 return true; 563 } 564 updateNextIndexes(); 565 } 566 dir = -1; 567 } 568 if (remaining > 0) { 569 // Fine-grained iterator: Continue a sequence of compressed changes. 570 int32_t u = array[index]; 571 U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 572 if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { 573 ++remaining; 574 updatePreviousIndexes(); 575 return true; 576 } 577 remaining = 0; 578 } 579 if (index <= 0) { 580 return noNext(); 581 } 582 int32_t u = array[--index]; 583 if (u <= MAX_UNCHANGED) { 584 // Combine adjacent unchanged ranges. 585 changed = false; 586 oldLength_ = u + 1; 587 while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) { 588 --index; 589 oldLength_ += u + 1; 590 } 591 newLength_ = oldLength_; 592 // No need to handle onlyChanges as long as previous() is called only from findIndex(). 593 updatePreviousIndexes(); 594 return true; 595 } 596 changed = true; 597 if (u <= MAX_SHORT_CHANGE) { 598 int32_t oldLen = u >> 12; 599 int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 600 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; 601 if (coarse) { 602 oldLength_ = num * oldLen; 603 newLength_ = num * newLen; 604 } else { 605 // Split a sequence of changes that was compressed into one unit. 606 oldLength_ = oldLen; 607 newLength_ = newLen; 608 if (num > 1) { 609 remaining = 1; // This is the last of two or more changes. 610 } 611 updatePreviousIndexes(); 612 return true; 613 } 614 } else { 615 if (u <= 0x7fff) { 616 // The change is encoded in u alone. 617 oldLength_ = readLength((u >> 6) & 0x3f); 618 newLength_ = readLength(u & 0x3f); 619 } else { 620 // Back up to the head of the change, read the lengths, 621 // and reset the index to the head again. 622 U_ASSERT(index > 0); 623 while ((u = array[--index]) > 0x7fff) {} 624 U_ASSERT(u > MAX_SHORT_CHANGE); 625 int32_t headIndex = index++; 626 oldLength_ = readLength((u >> 6) & 0x3f); 627 newLength_ = readLength(u & 0x3f); 628 index = headIndex; 629 } 630 if (!coarse) { 631 updatePreviousIndexes(); 632 return true; 633 } 634 } 635 // Combine adjacent changes. 636 while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { 637 --index; 638 if (u <= MAX_SHORT_CHANGE) { 639 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; 640 oldLength_ += (u >> 12) * num; 641 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 642 } else if (u <= 0x7fff) { 643 // Read the lengths, and reset the index to the head again. 644 int32_t headIndex = index++; 645 oldLength_ += readLength((u >> 6) & 0x3f); 646 newLength_ += readLength(u & 0x3f); 647 index = headIndex; 648 } 649 } 650 updatePreviousIndexes(); 651 return true; 652 } 653 654 int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) { 655 if (U_FAILURE(errorCode) || i < 0) { return -1; } 656 int32_t spanStart, spanLength; 657 if (findSource) { // find source index 658 spanStart = srcIndex; 659 spanLength = oldLength_; 660 } else { // find destination index 661 spanStart = destIndex; 662 spanLength = newLength_; 663 } 664 if (i < spanStart) { 665 if (i >= (spanStart / 2)) { 666 // Search backwards. 667 for (;;) { 668 UBool hasPrevious = previous(errorCode); 669 U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0 670 (void)hasPrevious; // avoid unused-variable warning 671 spanStart = findSource ? srcIndex : destIndex; 672 if (i >= spanStart) { 673 // The index is in the current span. 674 return 0; 675 } 676 if (remaining > 0) { 677 // Is the index in one of the remaining compressed edits? 678 // spanStart is the start of the current span, first of the remaining ones. 679 spanLength = findSource ? oldLength_ : newLength_; 680 int32_t u = array[index]; 681 U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 682 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining; 683 int32_t len = num * spanLength; 684 if (i >= (spanStart - len)) { 685 int32_t n = ((spanStart - i - 1) / spanLength) + 1; 686 // 1 <= n <= num 687 srcIndex -= n * oldLength_; 688 replIndex -= n * newLength_; 689 destIndex -= n * newLength_; 690 remaining += n; 691 return 0; 692 } 693 // Skip all of these edits at once. 694 srcIndex -= num * oldLength_; 695 replIndex -= num * newLength_; 696 destIndex -= num * newLength_; 697 remaining = 0; 698 } 699 } 700 } 701 // Reset the iterator to the start. 702 dir = 0; 703 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; 704 } else if (i < (spanStart + spanLength)) { 705 // The index is in the current span. 706 return 0; 707 } 708 while (next(false, errorCode)) { 709 if (findSource) { 710 spanStart = srcIndex; 711 spanLength = oldLength_; 712 } else { 713 spanStart = destIndex; 714 spanLength = newLength_; 715 } 716 if (i < (spanStart + spanLength)) { 717 // The index is in the current span. 718 return 0; 719 } 720 if (remaining > 1) { 721 // Is the index in one of the remaining compressed edits? 722 // spanStart is the start of the current span, first of the remaining ones. 723 int32_t len = remaining * spanLength; 724 if (i < (spanStart + len)) { 725 int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1 726 srcIndex += n * oldLength_; 727 replIndex += n * newLength_; 728 destIndex += n * newLength_; 729 remaining -= n; 730 return 0; 731 } 732 // Make next() skip all of these edits at once. 733 oldLength_ *= remaining; 734 newLength_ *= remaining; 735 remaining = 0; 736 } 737 } 738 return 1; 739 } 740 741 int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) { 742 int32_t where = findIndex(i, true, errorCode); 743 if (where < 0) { 744 // Error or before the string. 745 return 0; 746 } 747 if (where > 0 || i == srcIndex) { 748 // At or after string length, or at start of the found span. 749 return destIndex; 750 } 751 if (changed) { 752 // In a change span, map to its end. 753 return destIndex + newLength_; 754 } else { 755 // In an unchanged span, offset 1:1 within it. 756 return destIndex + (i - srcIndex); 757 } 758 } 759 760 int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) { 761 int32_t where = findIndex(i, false, errorCode); 762 if (where < 0) { 763 // Error or before the string. 764 return 0; 765 } 766 if (where > 0 || i == destIndex) { 767 // At or after string length, or at start of the found span. 768 return srcIndex; 769 } 770 if (changed) { 771 // In a change span, map to its end. 772 return srcIndex + oldLength_; 773 } else { 774 // In an unchanged span, offset within it. 775 return srcIndex + (i - destIndex); 776 } 777 } 778 779 UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const { 780 sb.append(u"{ src[", -1); 781 ICU_Utility::appendNumber(sb, srcIndex); 782 sb.append(u"..", -1); 783 ICU_Utility::appendNumber(sb, srcIndex + oldLength_); 784 if (changed) { 785 sb.append(u"] ⇝ dest[", -1); 786 } else { 787 sb.append(u"] ≡ dest[", -1); 788 } 789 ICU_Utility::appendNumber(sb, destIndex); 790 sb.append(u"..", -1); 791 ICU_Utility::appendNumber(sb, destIndex + newLength_); 792 if (changed) { 793 sb.append(u"], repl[", -1); 794 ICU_Utility::appendNumber(sb, replIndex); 795 sb.append(u"..", -1); 796 ICU_Utility::appendNumber(sb, replIndex + newLength_); 797 sb.append(u"] }", -1); 798 } else { 799 sb.append(u"] (no-change) }", -1); 800 } 801 return sb; 802 } 803 804 U_NAMESPACE_END