utext.cpp (99192B)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2005-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utext.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2005apr12
*   created by: Markus W. Scherer
*/

#include <cstddef>

#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/utext.h"
#include "unicode/utf.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "putilimp.h"

U_NAMESPACE_USE

// Build a single-bit int32_t mask from a bit index (used with the
// UTEXT_PROVIDER_* property indexes below).
#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))


//
//  utext_access   Forward a chunk-access request to the provider's access function.
//                 Returns whatever the provider returns.
//
static UBool
utext_access(UText *ut, int64_t index, UBool forward) {
    return ut->pFuncs->access(ut, index, forward);
}



//
//  utext_moveIndex32   Move the iteration position by delta code points,
//                      forwards for delta>0 and backwards for delta<0.
//     Returns false as soon as a needed chunk cannot be accessed or
//     utext_next32()/utext_previous32() reports U_SENTINEL; true otherwise
//     (including delta==0, which does nothing).
//
U_CAPI UBool U_EXPORT2
utext_moveIndex32(UText *ut, int32_t delta) {
    UChar32  c;
    if (delta > 0) {
        do {
            // At the end of the current chunk: load the following one.
            if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, true)) {
                return false;
            }
            c = ut->chunkContents[ut->chunkOffset];
            if (U16_IS_SURROGATE(c)) {
                // Let utext_next32() handle pairs, including pairs split across chunks.
                c = utext_next32(ut);
                if (c == U_SENTINEL) {
                    return false;
                }
            } else {
                ut->chunkOffset++;
            }
        } while(--delta>0);

    } else if (delta<0) {
        do {
            // At the start of the current chunk: load the preceding one.
            if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, false)) {
                return false;
            }
            c = ut->chunkContents[ut->chunkOffset-1];
            if (U16_IS_SURROGATE(c)) {
                // Let utext_previous32() handle pairs, including pairs split across chunks.
                c = utext_previous32(ut);
                if (c == U_SENTINEL) {
                    return false;
                }
            } else {
                ut->chunkOffset--;
            }
        } while(++delta<0);
    }

    return true;
}


//
//  utext_nativeLength   Return the length reported by the provider's nativeLength function.
//
U_CAPI int64_t U_EXPORT2
utext_nativeLength(UText *ut) {
    return ut->pFuncs->nativeLength(ut);
}


//
//  utext_isLengthExpensive   true if the UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE bit
//                            is set in the provider properties.
//
U_CAPI UBool U_EXPORT2
utext_isLengthExpensive(const UText *ut) {
    UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
    return r;
}


//
//  utext_getNativeIndex   Return the native index of the current iteration position.
//     Offsets up to nativeIndexingLimit map 1:1 onto native indexes relative to
//     chunkNativeStart; anything beyond that is translated by the provider.
//
U_CAPI int64_t U_EXPORT2
utext_getNativeIndex(const UText *ut) {
    if(ut->chunkOffset <= ut->nativeIndexingLimit) {
        return ut->chunkNativeStart+ut->chunkOffset;
    } else {
        return ut->pFuncs->mapOffsetToNative(ut);
    }
}


//
//  utext_setNativeIndex   Set the iteration position to a native index.
//     If the resulting position lands on a trail surrogate that is preceded
//     by a lead surrogate, the position is moved back onto the lead.
//
U_CAPI void U_EXPORT2
utext_setNativeIndex(UText *ut, int64_t index) {
    if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
        // The desired position is outside of the current chunk.
        // Access the new position.  Assume a forward iteration from here,
        // which will also be optimum for a single random access.
        // Reverse iterations may suffer slightly.
        ut->pFuncs->access(ut, index, true);
    } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
        // utf-16 indexing.
        ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
    } else {
        // In the chunk, but past the 1:1 region: the provider maps the index.
        ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
    }
    // The convention is that the index must always be on a code point boundary.
    // Adjust the index position if it is in the middle of a surrogate pair.
    if (ut->chunkOffset<ut->chunkLength) {
        char16_t c= ut->chunkContents[ut->chunkOffset];
        if (U16_IS_TRAIL(c)) {
            if (ut->chunkOffset==0) {
                // The possible lead surrogate is in the preceding chunk; try to load it.
                ut->pFuncs->access(ut, ut->chunkNativeStart, false);
            }
            if (ut->chunkOffset>0) {
                char16_t lead = ut->chunkContents[ut->chunkOffset-1];
                if (U16_IS_LEAD(lead)) {
                    ut->chunkOffset--;
                }
            }
        }
    }
}



//
//  utext_getPreviousNativeIndex   Return the native index of the code point
//     preceding the current position.  The iteration position is the same on
//     return as on entry (the slow path steps back and then forward again).
//
U_CAPI int64_t U_EXPORT2
utext_getPreviousNativeIndex(UText *ut) {
    //
    //  Fast-path the common case.
    //     Common means current position is not at the beginning of a chunk
    //     and the preceding character is not supplementary.
    //
    int32_t i = ut->chunkOffset - 1;
    int64_t result;
    if (i >= 0) {
        char16_t c = ut->chunkContents[i];
        if (U16_IS_TRAIL(c) == false) {
            if (i <= ut->nativeIndexingLimit) {
                result = ut->chunkNativeStart + i;
            } else {
                // mapOffsetToNative() works on ut->chunkOffset, so move the
                // offset back temporarily and restore it afterwards.
                ut->chunkOffset = i;
                result = ut->pFuncs->mapOffsetToNative(ut);
                ut->chunkOffset++;
            }
            return result;
        }
    }

    // If at the start of text, simply return 0.
    if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
        return 0;
    }

    // Harder, less common cases.  We are at a chunk boundary, or on a surrogate.
    //    Keep it simple, use other functions to handle the edges.
    //
    utext_previous32(ut);
    result = UTEXT_GETNATIVEINDEX(ut);
    utext_next32(ut);
    return result;
}


//
//  utext_current32.  Get the UChar32 at the current position.
//                    UText iteration position is always on a code point boundary,
//                    never on the trail half of a surrogate pair.
//     Returns U_SENTINEL when the position is at the end of the chunk and no
//     following chunk can be accessed.  An unpaired lead surrogate is returned as is.
//
U_CAPI UChar32 U_EXPORT2
utext_current32(UText *ut) {
    UChar32  c;
    if (ut->chunkOffset==ut->chunkLength) {
        // Current position is just off the end of the chunk.
        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, true) == false) {
            // Off the end of the text.
            return U_SENTINEL;
        }
    }

    c = ut->chunkContents[ut->chunkOffset];
    if (U16_IS_LEAD(c) == false) {
        // Normal, non-supplementary case.
        return c;
    }

    //
    //  Possible supplementary char.
    //
    UChar32   trail = 0;
    UChar32   supplementaryC = c;
    if ((ut->chunkOffset+1) < ut->chunkLength) {
        // The trail surrogate is in the same chunk.
        trail = ut->chunkContents[ut->chunkOffset+1];
    } else {
        //  The trail surrogate is in a different chunk.
        //     Because we must maintain the iteration position, we need to switch forward
        //     into the new chunk, get the trail surrogate, then revert the chunk back to the
        //     original one.
        //     An edge case to be careful of:  the entire text may end with an unpaired
        //        leading surrogate.  The attempt to access the trail will fail, but
        //        the original position before the unpaired lead still needs to be restored.
        int64_t  nativePosition = ut->chunkNativeLimit;
        if (ut->pFuncs->access(ut, nativePosition, true)) {
            trail = ut->chunkContents[ut->chunkOffset];
        }
        UBool r = ut->pFuncs->access(ut, nativePosition, false);  // reverse iteration flag loads preceding chunk
        U_ASSERT(r);
        // Here we need to restore chunkOffset since the access functions were called with
        // chunkNativeLimit but that is not where we were (we were 1 code unit before the
        // limit). Restoring was originally added in ICU-4669 but did not support access
        // functions that changed the chunk size, the following does.
        ut->chunkOffset = ut->chunkLength - 1;
        if(!r) {
            return U_SENTINEL;
        }
    }

    if (U16_IS_TRAIL(trail)) {
        supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
    }
    return supplementaryC;

}


//
//  utext_char32At   Set the iteration position to nativeIndex and return the
//                   code point there, or U_SENTINEL if no code unit is available.
//
U_CAPI UChar32 U_EXPORT2
utext_char32At(UText *ut, int64_t nativeIndex) {
    UChar32 c = U_SENTINEL;

    // Fast path the common case.
    //   (index inside the 1:1 native-indexing part of the current chunk, not a surrogate)
    if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
        ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
        c = ut->chunkContents[ut->chunkOffset];
        if (U16_IS_SURROGATE(c) == false) {
            return c;
        }
    }


    // General case: position properly (this also snaps onto a code point boundary).
    utext_setNativeIndex(ut, nativeIndex);
    if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
        c = ut->chunkContents[ut->chunkOffset];
        if (U16_IS_SURROGATE(c)) {
            // For surrogates, let current32() deal with the complications
            //    of supplementaries that may span chunk boundaries.
            c = utext_current32(ut);
        }
    }
    return c;
}


//
//  utext_next32   Return the code point at the current position and advance
//                 past it.  Returns U_SENTINEL at the end of the text.
//                 Unpaired surrogates are returned as is.
//
U_CAPI UChar32 U_EXPORT2
utext_next32(UText *ut) {
    UChar32       c;

    if (ut->chunkOffset >= ut->chunkLength) {
        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, true) == false) {
            return U_SENTINEL;
        }
    }

    c = ut->chunkContents[ut->chunkOffset++];
    if (U16_IS_LEAD(c) == false) {
        // Normal case, not supplementary.
        //   (A trail surrogate seen here is just returned as is, as a surrogate value.
        //    It cannot be part of a pair.)
        return c;
    }

    if (ut->chunkOffset >= ut->chunkLength) {
        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, true) == false) {
            // c is an unpaired lead surrogate at the end of the text.
            // return it as it is.
            return c;
        }
    }
    UChar32 trail = ut->chunkContents[ut->chunkOffset];
    if (U16_IS_TRAIL(trail) == false) {
        // c was an unpaired lead surrogate, not at the end of the text.
        // return it as it is (unpaired).  Iteration position is on the
        // following character, possibly in the next chunk, where the
        // trail surrogate would have been if it had existed.
        return c;
    }

    UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
    ut->chunkOffset++;   // move iteration position over the trail surrogate.
    return supplementary;
}


//
//  utext_previous32   Move back one code point and return it.
//                     Returns U_SENTINEL at the start of the text.
//                     Unpaired surrogates are returned as is.
//
U_CAPI UChar32 U_EXPORT2
utext_previous32(UText *ut) {
    UChar32       c;

    if (ut->chunkOffset <= 0) {
        if (ut->pFuncs->access(ut, ut->chunkNativeStart, false) == false) {
            return U_SENTINEL;
        }
    }
    ut->chunkOffset--;
    c = ut->chunkContents[ut->chunkOffset];
    if (U16_IS_TRAIL(c) == false) {
        // Normal case, not supplementary.
        //   (A lead surrogate seen here is just returned as is, as a surrogate value.
        //    It cannot be part of a pair.)
        return c;
    }

    if (ut->chunkOffset <= 0) {
        if (ut->pFuncs->access(ut, ut->chunkNativeStart, false) == false) {
            // c is an unpaired trail surrogate at the start of the text.
            // return it as it is.
            return c;
        }
    }

    UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
    if (U16_IS_LEAD(lead) == false) {
        // c was an unpaired trail surrogate, not at the start of the text.
        // return it as it is (unpaired).  Iteration position is at c
        return c;
    }

    UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
    ut->chunkOffset--;   // move iteration position over the lead surrogate.
    return supplementary;
}



//
//  utext_next32From   Position at a native index, return the code point there
//                     and advance past it.  Returns U_SENTINEL if no chunk is
//                     available at the index.
//
U_CAPI UChar32 U_EXPORT2
utext_next32From(UText *ut, int64_t index) {
    UChar32       c      = U_SENTINEL;

    if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
        // Desired position is outside of the current chunk.
        if(!ut->pFuncs->access(ut, index, true)) {
            // no chunk available here
            return U_SENTINEL;
        }
    } else if (index - ut->chunkNativeStart  <= (int64_t)ut->nativeIndexingLimit) {
        // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
        ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
    } else {
        // Desired position is in chunk, with non-UTF16 indexing.
        ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
    }

    c = ut->chunkContents[ut->chunkOffset++];
    if (U16_IS_SURROGATE(c)) {
        // Surrogates.  Many edge cases.  Use other functions that already
        //              deal with the problems.
        utext_setNativeIndex(ut, index);
        c = utext_next32(ut);
    }
    return c;
}


U_CAPI UChar32 U_EXPORT2
utext_previous32From(UText *ut, int64_t index) {
    //
    //  Return the character preceding the specified index.
    //  Leave the iteration position at the start of the character that was returned.
    //  Returns U_SENTINEL if no chunk is available for the position.
    //
    UChar32     cPrev;    // The character preceding the index, which is what we will return.

    // Address the chunk containing the position preceding the incoming index
    // A tricky edge case:
    //   We try to test the requested native index against the chunkNativeStart to determine
    //    whether the character preceding the one at the index is in the current chunk.
    //    BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
    //    requested index is on something other than the first position of the first char.
    //
    if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
        // Requested native index is outside of the current chunk.
        if(!ut->pFuncs->access(ut, index, false)) {
            // no chunk available here
            return U_SENTINEL;
        }
    } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
        // Direct UTF-16 indexing.
        ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
    } else {
        ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
        // The index mapped onto the first code unit of the chunk (the edge case
        // described above), so the preceding character needs the previous chunk.
        if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, false)) {
            // no chunk available here
            return U_SENTINEL;
        }
    }

    //
    // Simple case with no surrogates.
    //
    ut->chunkOffset--;
    cPrev = ut->chunkContents[ut->chunkOffset];

    if (U16_IS_SURROGATE(cPrev)) {
        // Possible supplementary.  Many edge cases.
        // Let other functions do the heavy lifting.
        utext_setNativeIndex(ut, index);
        cPrev = utext_previous32(ut);
    }
    return cPrev;
}


//
//  utext_extract   Forward to the provider's extract function and return its result.
//
U_CAPI int32_t U_EXPORT2
utext_extract(UText *ut,
             int64_t start, int64_t limit,
             char16_t *dest, int32_t destCapacity,
             UErrorCode *status) {
    return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
}



//
//  utext_equals   true only when both arguments are valid UTexts (non-null,
//                 correct magic) that share the same provider function table,
//                 the same context pointer and the same native index.
//
U_CAPI UBool U_EXPORT2
utext_equals(const UText *a, const UText *b) {
    if (a==nullptr || b==nullptr ||
        a->magic != UTEXT_MAGIC ||
        b->magic != UTEXT_MAGIC) {
            // Null or invalid arguments don't compare equal to anything.
            return false;
    }

    if (a->pFuncs != b->pFuncs) {
        // Different types of text providers.
        return false;
    }

    if (a->context != b->context) {
        // Different sources (different strings)
        return false;
    }
    if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
        // Different current position in the string.
        return false;
    }

    return true;
}

//
//  utext_isWritable   true if the UTEXT_PROVIDER_WRITABLE bit is set.
//
U_CAPI UBool U_EXPORT2
utext_isWritable(const UText *ut)
{
    UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
    return b;
}


//
//  utext_freeze   Make the UText read-only by clearing its WRITABLE property bit.
//
U_CAPI void U_EXPORT2
utext_freeze(UText *ut) {
    // Zero out the WRITABLE flag.
    ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
}


//
//  utext_hasMetaData   true if the UTEXT_PROVIDER_HAS_META_DATA bit is set.
//
U_CAPI UBool U_EXPORT2
utext_hasMetaData(const UText *ut)
{
    UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
    return b;
}



//
//  utext_replace   Forward a replace request to the provider.
//     Returns 0 without calling the provider when *status already indicates
//     failure, or when the UText is not writable (in which case *status is set
//     to U_NO_WRITE_PERMISSION).  Otherwise returns the provider's result.
//
U_CAPI int32_t U_EXPORT2
utext_replace(UText *ut,
             int64_t nativeStart, int64_t nativeLimit,
             const char16_t *replacementText, int32_t replacementLength,
             UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return 0;
    }
    if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
        *status = U_NO_WRITE_PERMISSION;
        return 0;
    }
    int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
    return i;
}

//
//  utext_copy   Forward a copy/move request to the provider.
//     Does nothing when *status already indicates failure; sets *status to
//     U_NO_WRITE_PERMISSION (and does nothing else) when the UText is not writable.
//
U_CAPI void U_EXPORT2
utext_copy(UText *ut,
          int64_t nativeStart, int64_t nativeLimit,
          int64_t destIndex,
          UBool move,
          UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }
    if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
        *status = U_NO_WRITE_PERMISSION;
        return;
    }
    ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
}



//
//  utext_clone   Clone src through its provider's clone function.
//     Returns dest unchanged when *status already indicates failure.
//     A null result from the provider without a failure status is reported as
//     U_MEMORY_ALLOCATION_ERROR.  When readOnly is set the clone is frozen.
//
U_CAPI UText * U_EXPORT2
utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return dest;
    }
    UText *result = src->pFuncs->clone(dest, src, deep, status);
    if (U_FAILURE(*status)) {
        return result;
    }
    if (result == nullptr) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return result;
    }
    if (readOnly) {
        utext_freeze(result);
    }
    return result;
}



//------------------------------------------------------------------------------
//
//   UText common functions implementation
//
//------------------------------------------------------------------------------

//
//  UText.flags bit definitions
//
enum { 555 UTEXT_HEAP_ALLOCATED = 1, // 1 if ICU has allocated this UText struct on the heap. 556 // 0 if caller provided storage for the UText. 557 558 UTEXT_EXTRA_HEAP_ALLOCATED = 2, // 1 if ICU has allocated extra storage as a separate 559 // heap block. 560 // 0 if there is no separate allocation. Either no extra 561 // storage was requested, or it is appended to the end 562 // of the main UText storage. 563 564 UTEXT_OPEN = 4 // 1 if this UText is currently open 565 // 0 if this UText is not open. 566 }; 567 568 569 // 570 // Extended form of a UText. The purpose is to aid in computing the total size required 571 // when a provider asks for a UText to be allocated with extra storage. 572 573 struct ExtendedUText { 574 UText ut; 575 std::max_align_t extension; 576 }; 577 578 static const UText emptyText = UTEXT_INITIALIZER; 579 580 U_CAPI UText * U_EXPORT2 581 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { 582 if (U_FAILURE(*status)) { 583 return ut; 584 } 585 586 if (ut == nullptr) { 587 // We need to heap-allocate storage for the new UText 588 int32_t spaceRequired = sizeof(UText); 589 if (extraSpace > 0) { 590 spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(std::max_align_t); 591 } 592 ut = (UText *)uprv_malloc(spaceRequired); 593 if (ut == nullptr) { 594 *status = U_MEMORY_ALLOCATION_ERROR; 595 return nullptr; 596 } else { 597 *ut = emptyText; 598 ut->flags |= UTEXT_HEAP_ALLOCATED; 599 if (spaceRequired>0) { 600 ut->extraSize = extraSpace; 601 ut->pExtra = &((ExtendedUText *)ut)->extension; 602 } 603 } 604 } else { 605 // We have been supplied with an already existing UText. 606 // Verify that it really appears to be a UText. 607 if (ut->magic != UTEXT_MAGIC) { 608 *status = U_ILLEGAL_ARGUMENT_ERROR; 609 return ut; 610 } 611 // If the ut is already open and there's a provider supplied close 612 // function, call it. 
613 if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != nullptr) { 614 ut->pFuncs->close(ut); 615 } 616 ut->flags &= ~UTEXT_OPEN; 617 618 // If extra space was requested by our caller, check whether 619 // sufficient already exists, and allocate new if needed. 620 if (extraSpace > ut->extraSize) { 621 // Need more space. If there is existing separately allocated space, 622 // delete it first, then allocate new space. 623 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { 624 uprv_free(ut->pExtra); 625 ut->extraSize = 0; 626 } 627 ut->pExtra = uprv_malloc(extraSpace); 628 if (ut->pExtra == nullptr) { 629 *status = U_MEMORY_ALLOCATION_ERROR; 630 } else { 631 ut->extraSize = extraSpace; 632 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED; 633 } 634 } 635 } 636 if (U_SUCCESS(*status)) { 637 ut->flags |= UTEXT_OPEN; 638 639 // Initialize all remaining fields of the UText. 640 // 641 ut->context = nullptr; 642 ut->chunkContents = nullptr; 643 ut->p = nullptr; 644 ut->q = nullptr; 645 ut->r = nullptr; 646 ut->a = 0; 647 ut->b = 0; 648 ut->c = 0; 649 ut->chunkOffset = 0; 650 ut->chunkLength = 0; 651 ut->chunkNativeStart = 0; 652 ut->chunkNativeLimit = 0; 653 ut->nativeIndexingLimit = 0; 654 ut->providerProperties = 0; 655 ut->privA = 0; 656 ut->privB = 0; 657 ut->privC = 0; 658 ut->privP = nullptr; 659 if (ut->pExtra!=nullptr && ut->extraSize>0) 660 uprv_memset(ut->pExtra, 0, ut->extraSize); 661 662 } 663 return ut; 664 } 665 666 667 U_CAPI UText * U_EXPORT2 668 utext_close(UText *ut) { 669 if (ut==nullptr || 670 ut->magic != UTEXT_MAGIC || 671 (ut->flags & UTEXT_OPEN) == 0) 672 { 673 // The supplied ut is not an open UText. 674 // Do nothing. 675 return ut; 676 } 677 678 // If the provider gave us a close function, call it now. 679 // This will clean up anything allocated specifically by the provider. 
680 if (ut->pFuncs->close != nullptr) { 681 ut->pFuncs->close(ut); 682 } 683 ut->flags &= ~UTEXT_OPEN; 684 685 // If we (the framework) allocated the UText or subsidiary storage, 686 // delete it. 687 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { 688 uprv_free(ut->pExtra); 689 ut->pExtra = nullptr; 690 ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED; 691 ut->extraSize = 0; 692 } 693 694 // Zero out function table of the closed UText. This is a defensive move, 695 // intended to cause applications that inadvertently use a closed 696 // utext to crash with null pointer errors. 697 ut->pFuncs = nullptr; 698 699 if (ut->flags & UTEXT_HEAP_ALLOCATED) { 700 // This UText was allocated by UText setup. We need to free it. 701 // Clear magic, so we can detect if the user messes up and immediately 702 // tries to reopen another UText using the deleted storage. 703 ut->magic = 0; 704 uprv_free(ut); 705 ut = nullptr; 706 } 707 return ut; 708 } 709 710 711 712 713 // 714 // invalidateChunk Reset a chunk to have no contents, so that the next call 715 // to access will cause new data to load. 716 // This is needed when copy/move/replace operate directly on the 717 // backing text, potentially putting it out of sync with the 718 // contents in the chunk. 719 // 720 static void 721 invalidateChunk(UText *ut) { 722 ut->chunkLength = 0; 723 ut->chunkNativeLimit = 0; 724 ut->chunkNativeStart = 0; 725 ut->chunkOffset = 0; 726 ut->nativeIndexingLimit = 0; 727 } 728 729 // 730 // pinIndex Do range pinning on a native index parameter. 731 // 64 bit pinning is done in place. 732 // 32 bit truncated result is returned as a convenience for 733 // use in providers that don't need 64 bits. 734 static int32_t 735 pinIndex(int64_t &index, int64_t limit) { 736 if (index<0) { 737 index = 0; 738 } else if (index > limit) { 739 index = limit; 740 } 741 return static_cast<int32_t>(index); 742 } 743 744 745 U_CDECL_BEGIN 746 747 // 748 // Pointer relocation function, 749 // a utility used by shallow clone. 
750 // Adjust a pointer that refers to something within one UText (the source) 751 // to refer to the same relative offset within a another UText (the target) 752 // 753 static void adjustPointer(UText *dest, const void **destPtr, const UText *src) { 754 // convert all pointers to (char *) so that byte address arithmetic will work. 755 char *dptr = (char *)*destPtr; 756 char *dUText = (char *)dest; 757 char *sUText = (char *)src; 758 759 if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) { 760 // target ptr was to something within the src UText's pExtra storage. 761 // relocate it into the target UText's pExtra region. 762 *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra); 763 } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) { 764 // target ptr was pointing to somewhere within the source UText itself. 765 // Move it to the same offset within the target UText. 766 *destPtr = dUText + (dptr-sUText); 767 } 768 } 769 770 771 // 772 // Clone. This is a generic copy-the-utext-by-value clone function that can be 773 // used as-is with some utext types, and as a helper by other clones. 774 // 775 static UText * U_CALLCONV 776 shallowTextClone(UText * dest, const UText * src, UErrorCode * status) { 777 if (U_FAILURE(*status)) { 778 return nullptr; 779 } 780 int32_t srcExtraSize = src->extraSize; 781 782 // 783 // Use the generic text_setup to allocate storage if required. 784 // 785 dest = utext_setup(dest, srcExtraSize, status); 786 if (U_FAILURE(*status)) { 787 return dest; 788 } 789 790 // 791 // flags (how the UText was allocated) and the pointer to the 792 // extra storage must retain the values in the cloned utext that 793 // were set up by utext_setup. Save them separately before 794 // copying the whole struct. 795 // 796 void *destExtra = dest->pExtra; 797 int32_t flags = dest->flags; 798 799 800 // 801 // Copy the whole UText struct by value. 802 // Any "Extra" storage is copied also. 
803 // 804 int sizeToCopy = src->sizeOfStruct; 805 if (sizeToCopy > dest->sizeOfStruct) { 806 sizeToCopy = dest->sizeOfStruct; 807 } 808 uprv_memcpy(dest, src, sizeToCopy); 809 dest->pExtra = destExtra; 810 dest->flags = flags; 811 if (srcExtraSize > 0) { 812 uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize); 813 } 814 815 // 816 // Relocate any pointers in the target that refer to the UText itself 817 // to point to the cloned copy rather than the original source. 818 // 819 adjustPointer(dest, &dest->context, src); 820 adjustPointer(dest, &dest->p, src); 821 adjustPointer(dest, &dest->q, src); 822 adjustPointer(dest, &dest->r, src); 823 adjustPointer(dest, (const void **)&dest->chunkContents, src); 824 825 // The newly shallow-cloned UText does _not_ own the underlying storage for the text. 826 // (The source for the clone may or may not have owned the text.) 827 828 dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 829 830 return dest; 831 } 832 833 834 U_CDECL_END 835 836 837 838 //------------------------------------------------------------------------------ 839 // 840 // UText implementation for UTF-8 char * strings (read-only) 841 // Limitation: string length must be <= 0x7fffffff in length. 842 // (length must for in an int32_t variable) 843 // 844 // Use of UText data members: 845 // context pointer to UTF-8 string 846 // utext.b is the input string length (bytes). 847 // utext.c Length scanned so far in string 848 // (for optimizing finding length of zero terminated strings.) 849 // utext.p pointer to the current buffer 850 // utext.q pointer to the other buffer. 851 // 852 //------------------------------------------------------------------------------ 853 854 // Chunk size. 855 // Must be less than 85 (256/3), because of byte mapping from char16_t indexes to native indexes. 856 // Worst case is three native bytes to one char16_t. (Supplemenaries are 4 native bytes 857 // to two UChars.) 
//     The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
//     is a three-byte sequence (truncated four-byte sequence).
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };

//
// UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
//          Each contains the char16_t chunk buffer, the to and from native maps, and
//          header info.
//
//     Because backwards iteration fills the buffers starting at the end and
//     working towards the front, the filled part of the buffers may not begin
//     at the start of the available storage for the buffers.
//
//     Buffer size is bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
//     the last character added being a supplementary, and thus requiring a surrogate
//     pair.  Doing this is simpler than checking for the edge case.
//

struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in char16_t buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    char16_t  buf[UTF8_TEXT_CHUNK_SIZE+4];           // The char16_t buffer.  Requires one extra position beyond
                                                     //   the chunk size, to allow for surrogate at the end.
                                                     //   Length must be identical to mapToNative array, below,
                                                     //   because of the way indexing works when the array is
                                                     //   filled backwards during a reverse iteration.  Thus,
                                                     //   the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // map char16_t index in buf to
                                                     //  native offset from bufNativeStart.
                                                     //  Requires two extra slots,
                                                     //    one for a supplementary starting in the last normal position,
                                                     //    and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                     //   corresponding offset in filled part of buf.
    int32_t   align;                                 // NOTE(review): not referenced in the code visible here;
                                                     //   presumably present only for alignment/padding - confirm.
};

U_CDECL_BEGIN

//
//   utf8TextLength
//
//        Get the length of the string.  If we don't already know it,
//              we'll need to scan for the trailing  nul.
//
//        A negative ut->b means the length is not known yet.  The scan starts
//        at offset ut->c of the context string, the result is stored in ut->b,
//        and the LENGTH_IS_EXPENSIVE property bit is cleared.
//        Returns ut->b.
//
static int64_t U_CALLCONV
utf8TextLength(UText *ut) {
    if (ut->b < 0) {
        // Zero terminated string, and we haven't scanned to the end yet.
        // Scan it now.
        const char *r = (const char *)ut->context + ut->c;
        while (*r != 0) {
            r++;
        }
        if ((r - (const char *)ut->context) < 0x7fffffff) {
            ut->b = (int32_t)(r - (const char *)ut->context);
        } else {
            // Actual string was bigger (more than 2 gig) than we
            //   can handle.  Clip it to 2 GB.
            ut->b = 0x7fffffff;
        }
        ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
    }
    return ut->b;
}






static UBool U_CALLCONV
utf8TextAccess(UText *ut, int64_t index, UBool forward) {
    //
    //  Apologies to those who are allergic to goto statements.
    //    Consider each goto to a labelled block to be the equivalent of
    //         call the named block as if it were a function();
    //         return;
    //
    const uint8_t *s8=(const uint8_t *)ut->context;
    UTF8Buf *u8b = nullptr;
    int32_t  length = ut->b;         // Length of original utf-8
    int32_t  ix= (int32_t)index;     // Requested index, trimmed to 32 bits.
    int32_t  mapIndex = 0;
    if (index<0) {
        ix=0;
    } else if (index > 0x7fffffff) {
        // Strings with 64 bit lengths not supported by this UTF-8 provider.
        ix = 0x7fffffff;
    }

    // Pin requested index to the string length.
959 if (ix>length) { 960 if (length>=0) { 961 ix=length; 962 } else if (ix>=ut->c) { 963 // Zero terminated string, and requested index is beyond 964 // the region that has already been scanned. 965 // Scan up to either the end of the string or to the 966 // requested position, whichever comes first. 967 while (ut->c<ix && s8[ut->c]!=0) { 968 ut->c++; 969 } 970 // TODO: support for null terminated string length > 32 bits. 971 if (s8[ut->c] == 0) { 972 // We just found the actual length of the string. 973 // Trim the requested index back to that. 974 ix = ut->c; 975 ut->b = ut->c; 976 length = ut->c; 977 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 978 } 979 } 980 } 981 982 // 983 // Dispatch to the appropriate action for a forward iteration request. 984 // 985 if (forward) { 986 if (ix==ut->chunkNativeLimit) { 987 // Check for normal sequential iteration cases first. 988 if (ix==length) { 989 // Just reached end of string 990 // Don't swap buffers, but do set the 991 // current buffer position. 992 ut->chunkOffset = ut->chunkLength; 993 return false; 994 } else { 995 // End of current buffer. 996 // check whether other buffer already has what we need. 997 UTF8Buf *altB = (UTF8Buf *)ut->q; 998 if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) { 999 goto swapBuffers; 1000 } 1001 } 1002 } 1003 1004 // A random access. Desired index could be in either or niether buf. 1005 // For optimizing the order of testing, first check for the index 1006 // being in the other buffer. This will be the case for uses that 1007 // move back and forth over a fairly limited range 1008 { 1009 u8b = (UTF8Buf *)ut->q; // the alternate buffer 1010 if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) { 1011 // Requested index is in the other buffer. 1012 goto swapBuffers; 1013 } 1014 if (ix == length) { 1015 // Requested index is end-of-string. 1016 // (this is the case of randomly seeking to the end. 
1017 // The case of iterating off the end is handled earlier.) 1018 if (ix == ut->chunkNativeLimit) { 1019 // Current buffer extends up to the end of the string. 1020 // Leave it as the current buffer. 1021 ut->chunkOffset = ut->chunkLength; 1022 return false; 1023 } 1024 if (ix == u8b->bufNativeLimit) { 1025 // Alternate buffer extends to the end of string. 1026 // Swap it in as the current buffer. 1027 goto swapBuffersAndFail; 1028 } 1029 1030 // Neither existing buffer extends to the end of the string. 1031 goto makeStubBuffer; 1032 } 1033 1034 if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) { 1035 // Requested index is in neither buffer. 1036 goto fillForward; 1037 } 1038 1039 // Requested index is in this buffer. 1040 u8b = (UTF8Buf *)ut->p; // the current buffer 1041 mapIndex = ix - u8b->toUCharsMapStart; 1042 U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars)); 1043 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1044 return true; 1045 1046 } 1047 } 1048 1049 1050 // 1051 // Dispatch to the appropriate action for a 1052 // Backwards Direction iteration request. 1053 // 1054 if (ix==ut->chunkNativeStart) { 1055 // Check for normal sequential iteration cases first. 1056 if (ix==0) { 1057 // Just reached the start of string 1058 // Don't swap buffers, but do set the 1059 // current buffer position. 1060 ut->chunkOffset = 0; 1061 return false; 1062 } else { 1063 // Start of current buffer. 1064 // check whether other buffer already has what we need. 1065 UTF8Buf *altB = (UTF8Buf *)ut->q; 1066 if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) { 1067 goto swapBuffers; 1068 } 1069 } 1070 } 1071 1072 // A random access. Desired index could be in either or niether buf. 1073 // For optimizing the order of testing, 1074 // Most likely case: in the other buffer. 1075 // Second most likely: in neither buffer. 1076 // Unlikely, but must work: in the current buffer. 
1077 u8b = (UTF8Buf *)ut->q; // the alternate buffer 1078 if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) { 1079 // Requested index is in the other buffer. 1080 goto swapBuffers; 1081 } 1082 // Requested index is start-of-string. 1083 // (this is the case of randomly seeking to the start. 1084 // The case of iterating off the start is handled earlier.) 1085 if (ix==0) { 1086 if (u8b->bufNativeStart==0) { 1087 // Alternate buffer contains the data for the start string. 1088 // Make it be the current buffer. 1089 goto swapBuffersAndFail; 1090 } else { 1091 // Request for data before the start of string, 1092 // neither buffer is usable. 1093 // set up a zero-length buffer. 1094 goto makeStubBuffer; 1095 } 1096 } 1097 1098 if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) { 1099 // Requested index is in neither buffer. 1100 goto fillReverse; 1101 } 1102 1103 // Requested index is in this buffer. 1104 // Set the utf16 buffer index. 1105 u8b = (UTF8Buf *)ut->p; 1106 mapIndex = ix - u8b->toUCharsMapStart; 1107 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1108 if (ut->chunkOffset==0) { 1109 // This occurs when the first character in the text is 1110 // a multi-byte UTF-8 char, and the requested index is to 1111 // one of the trailing bytes. Because there is no preceding , 1112 // character, this access fails. We can't pick up on the 1113 // situation sooner because the requested index is not zero. 1114 return false; 1115 } else { 1116 return true; 1117 } 1118 1119 1120 1121 swapBuffers: 1122 // The alternate buffer (ut->q) has the string data that was requested. 1123 // Swap the primary and alternate buffers, and set the 1124 // chunk index into the new primary buffer. 
1125 { 1126 u8b = (UTF8Buf *)ut->q; 1127 ut->q = ut->p; 1128 ut->p = u8b; 1129 ut->chunkContents = &u8b->buf[u8b->bufStartIdx]; 1130 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; 1131 ut->chunkNativeStart = u8b->bufNativeStart; 1132 ut->chunkNativeLimit = u8b->bufNativeLimit; 1133 ut->nativeIndexingLimit = u8b->bufNILimit; 1134 1135 // Index into the (now current) chunk 1136 // Use the map to set the chunk index. It's more trouble than it's worth 1137 // to check whether native indexing can be used. 1138 U_ASSERT(ix>=u8b->bufNativeStart); 1139 U_ASSERT(ix<=u8b->bufNativeLimit); 1140 mapIndex = ix - u8b->toUCharsMapStart; 1141 U_ASSERT(mapIndex>=0); 1142 U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars)); 1143 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1144 1145 return true; 1146 } 1147 1148 1149 swapBuffersAndFail: 1150 // We got a request for either the start or end of the string, 1151 // with iteration continuing in the out-of-bounds direction. 1152 // The alternate buffer already contains the data up to the 1153 // start/end. 1154 // Swap the buffers, then return failure, indicating that we couldn't 1155 // make things correct for continuing the iteration in the requested 1156 // direction. The position & buffer are correct should the 1157 // user decide to iterate in the opposite direction. 1158 u8b = (UTF8Buf *)ut->q; 1159 ut->q = ut->p; 1160 ut->p = u8b; 1161 ut->chunkContents = &u8b->buf[u8b->bufStartIdx]; 1162 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; 1163 ut->chunkNativeStart = u8b->bufNativeStart; 1164 ut->chunkNativeLimit = u8b->bufNativeLimit; 1165 ut->nativeIndexingLimit = u8b->bufNILimit; 1166 1167 // Index into the (now current) chunk 1168 // For this function (swapBuffersAndFail), the requested index 1169 // will always be at either the start or end of the chunk. 
1170 if (ix==u8b->bufNativeLimit) { 1171 ut->chunkOffset = ut->chunkLength; 1172 } else { 1173 ut->chunkOffset = 0; 1174 U_ASSERT(ix == u8b->bufNativeStart); 1175 } 1176 return false; 1177 1178 makeStubBuffer: 1179 // The user has done a seek/access past the start or end 1180 // of the string. Rather than loading data that is likely 1181 // to never be used, just set up a zero-length buffer at 1182 // the position. 1183 u8b = (UTF8Buf *)ut->q; 1184 u8b->bufNativeStart = ix; 1185 u8b->bufNativeLimit = ix; 1186 u8b->bufStartIdx = 0; 1187 u8b->bufLimitIdx = 0; 1188 u8b->bufNILimit = 0; 1189 u8b->toUCharsMapStart = ix; 1190 u8b->mapToNative[0] = 0; 1191 u8b->mapToUChars[0] = 0; 1192 goto swapBuffersAndFail; 1193 1194 1195 1196 fillForward: 1197 { 1198 // Move the incoming index to a code point boundary. 1199 U8_SET_CP_START(s8, 0, ix); 1200 1201 // Swap the UText buffers. 1202 // We want to fill what was previously the alternate buffer, 1203 // and make what was the current buffer be the new alternate. 1204 UTF8Buf *u8b_swap = (UTF8Buf *)ut->q; 1205 ut->q = ut->p; 1206 ut->p = u8b_swap; 1207 1208 int32_t strLen = ut->b; 1209 UBool nulTerminated = false; 1210 if (strLen < 0) { 1211 strLen = 0x7fffffff; 1212 nulTerminated = true; 1213 } 1214 1215 char16_t *buf = u8b_swap->buf; 1216 uint8_t *mapToNative = u8b_swap->mapToNative; 1217 uint8_t *mapToUChars = u8b_swap->mapToUChars; 1218 int32_t destIx = 0; 1219 int32_t srcIx = ix; 1220 UBool seenNonAscii = false; 1221 UChar32 c = 0; 1222 1223 // Fill the chunk buffer and mapping arrays. 1224 while (destIx<UTF8_TEXT_CHUNK_SIZE) { 1225 c = s8[srcIx]; 1226 if (c>0 && c<0x80) { 1227 // Special case ASCII range for speed. 1228 // zero is excluded to simplify bounds checking. 1229 buf[destIx] = (char16_t)c; 1230 mapToNative[destIx] = (uint8_t)(srcIx - ix); 1231 mapToUChars[srcIx-ix] = (uint8_t)destIx; 1232 srcIx++; 1233 destIx++; 1234 } else { 1235 // General case, handle everything. 
1236 if (seenNonAscii == false) { 1237 seenNonAscii = true; 1238 u8b_swap->bufNILimit = destIx; 1239 } 1240 1241 int32_t cIx = srcIx; 1242 int32_t dIx = destIx; 1243 int32_t dIxSaved = destIx; 1244 U8_NEXT_OR_FFFD(s8, srcIx, strLen, c); 1245 if (c==0 && nulTerminated) { 1246 srcIx--; 1247 break; 1248 } 1249 1250 U16_APPEND_UNSAFE(buf, destIx, c); 1251 do { 1252 mapToNative[dIx++] = (uint8_t)(cIx - ix); 1253 } while (dIx < destIx); 1254 1255 do { 1256 mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved; 1257 } while (cIx < srcIx); 1258 } 1259 if (srcIx>=strLen) { 1260 break; 1261 } 1262 1263 } 1264 1265 // store Native <--> Chunk Map entries for the end of the buffer. 1266 // There is no actual character here, but the index position is valid. 1267 mapToNative[destIx] = (uint8_t)(srcIx - ix); 1268 mapToUChars[srcIx - ix] = (uint8_t)destIx; 1269 1270 // fill in Buffer descriptor 1271 u8b_swap->bufNativeStart = ix; 1272 u8b_swap->bufNativeLimit = srcIx; 1273 u8b_swap->bufStartIdx = 0; 1274 u8b_swap->bufLimitIdx = destIx; 1275 if (seenNonAscii == false) { 1276 u8b_swap->bufNILimit = destIx; 1277 } 1278 u8b_swap->toUCharsMapStart = u8b_swap->bufNativeStart; 1279 1280 // Set UText chunk to refer to this buffer. 1281 ut->chunkContents = buf; 1282 ut->chunkOffset = 0; 1283 ut->chunkLength = u8b_swap->bufLimitIdx; 1284 ut->chunkNativeStart = u8b_swap->bufNativeStart; 1285 ut->chunkNativeLimit = u8b_swap->bufNativeLimit; 1286 ut->nativeIndexingLimit = u8b_swap->bufNILimit; 1287 1288 // For zero terminated strings, keep track of the maximum point 1289 // scanned so far. 1290 if (nulTerminated && srcIx>ut->c) { 1291 ut->c = srcIx; 1292 if (c==0) { 1293 // We scanned to the end. 1294 // Remember the actual length. 1295 ut->b = srcIx; 1296 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1297 } 1298 } 1299 return true; 1300 } 1301 1302 1303 fillReverse: 1304 { 1305 // Move the incoming index to a code point boundary. 
1306 // Can only do this if the incoming index is somewhere in the interior of the string. 1307 // If index is at the end, there is no character there to look at. 1308 if (ix != ut->b) { 1309 // Note: this function will only move the index back if it is on a trail byte 1310 // and there is a preceding lead byte and the sequence from the lead 1311 // through this trail could be part of a valid UTF-8 sequence 1312 // Otherwise the index remains unchanged. 1313 U8_SET_CP_START(s8, 0, ix); 1314 } 1315 1316 // Swap the UText buffers. 1317 // We want to fill what was previously the alternate buffer, 1318 // and make what was the current buffer be the new alternate. 1319 UTF8Buf *u8b_swap = (UTF8Buf *)ut->q; 1320 ut->q = ut->p; 1321 ut->p = u8b_swap; 1322 1323 char16_t *buf = u8b_swap->buf; 1324 uint8_t *mapToNative = u8b_swap->mapToNative; 1325 uint8_t *mapToUChars = u8b_swap->mapToUChars; 1326 int32_t toUCharsMapStart = ix - sizeof(UTF8Buf::mapToUChars) + 1; 1327 // Note that toUCharsMapStart can be negative. Happens when the remaining 1328 // text from current position to the beginning is less than the buffer size. 1329 // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry. 1330 int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region 1331 // at end of buffer to leave room 1332 // for a surrogate pair at the 1333 // buffer start. 1334 int32_t srcIx = ix; 1335 int32_t bufNILimit = destIx; 1336 UChar32 c; 1337 1338 // Map to/from Native Indexes, fill in for the position at the end of 1339 // the buffer. 1340 // 1341 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1342 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx; 1343 1344 // Fill the chunk buffer 1345 // Work backwards, filling from the end of the buffer towards the front. 
1346 // 1347 while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) { 1348 srcIx--; 1349 destIx--; 1350 1351 // Get last byte of the UTF-8 character 1352 c = s8[srcIx]; 1353 if (c<0x80) { 1354 // Special case ASCII range for speed. 1355 buf[destIx] = (char16_t)c; 1356 U_ASSERT(toUCharsMapStart <= srcIx); 1357 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx; 1358 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1359 } else { 1360 // General case, handle everything non-ASCII. 1361 1362 int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char 1363 1364 // Get the full character from the UTF8 string. 1365 // use code derived from the macros in utf8.h 1366 // Leaves srcIx pointing at the first byte of the UTF-8 char. 1367 // 1368 c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3); 1369 // leaves srcIx at first byte of the multi-byte char. 1370 1371 // Store the character in UTF-16 buffer. 1372 if (c<0x10000) { 1373 buf[destIx] = (char16_t)c; 1374 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1375 } else { 1376 buf[destIx] = U16_TRAIL(c); 1377 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1378 buf[--destIx] = U16_LEAD(c); 1379 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); 1380 } 1381 1382 // Fill in the map from native indexes to UChars buf index. 1383 do { 1384 mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx; 1385 } while (sIx >= srcIx); 1386 U_ASSERT(toUCharsMapStart <= (srcIx+1)); 1387 1388 // Set native indexing limit to be the current position. 1389 // We are processing a non-ascii, non-native-indexing char now; 1390 // the limit will be here if the rest of the chars to be 1391 // added to this buffer are ascii. 
1392 bufNILimit = destIx; 1393 } 1394 } 1395 u8b_swap->bufNativeStart = srcIx; 1396 u8b_swap->bufNativeLimit = ix; 1397 u8b_swap->bufStartIdx = destIx; 1398 u8b_swap->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2; 1399 u8b_swap->bufNILimit = bufNILimit - u8b_swap->bufStartIdx; 1400 u8b_swap->toUCharsMapStart = toUCharsMapStart; 1401 1402 ut->chunkContents = &buf[u8b_swap->bufStartIdx]; 1403 ut->chunkLength = u8b_swap->bufLimitIdx - u8b_swap->bufStartIdx; 1404 ut->chunkOffset = ut->chunkLength; 1405 ut->chunkNativeStart = u8b_swap->bufNativeStart; 1406 ut->chunkNativeLimit = u8b_swap->bufNativeLimit; 1407 ut->nativeIndexingLimit = u8b_swap->bufNILimit; 1408 return true; 1409 } 1410 1411 } 1412 1413 1414 1415 // 1416 // This is a slightly modified copy of u_strFromUTF8, 1417 // Inserts a Replacement Char rather than failing on invalid UTF-8 1418 // Removes unnecessary features. 1419 // 1420 static char16_t* 1421 utext_strFromUTF8(char16_t *dest, 1422 int32_t destCapacity, 1423 int32_t *pDestLength, 1424 const char* src, 1425 int32_t srcLength, // required. NUL terminated not supported. 
1426 UErrorCode *pErrorCode 1427 ) 1428 { 1429 1430 char16_t *pDest = dest; 1431 char16_t *pDestLimit = (dest!=nullptr)?(dest+destCapacity):nullptr; 1432 UChar32 ch=0; 1433 int32_t index = 0; 1434 int32_t reqLength = 0; 1435 uint8_t* pSrc = (uint8_t*) src; 1436 1437 1438 while((index < srcLength)&&(pDest<pDestLimit)){ 1439 ch = pSrc[index++]; 1440 if(ch <=0x7f){ 1441 *pDest++=(char16_t)ch; 1442 }else{ 1443 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); 1444 if(U_IS_BMP(ch)){ 1445 *(pDest++)=(char16_t)ch; 1446 }else{ 1447 *(pDest++)=U16_LEAD(ch); 1448 if(pDest<pDestLimit){ 1449 *(pDest++)=U16_TRAIL(ch); 1450 }else{ 1451 reqLength++; 1452 break; 1453 } 1454 } 1455 } 1456 } 1457 /* donot fill the dest buffer just count the UChars needed */ 1458 while(index < srcLength){ 1459 ch = pSrc[index++]; 1460 if(ch <= 0x7f){ 1461 reqLength++; 1462 }else{ 1463 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); 1464 reqLength+=U16_LENGTH(ch); 1465 } 1466 } 1467 1468 reqLength+=(int32_t)(pDest - dest); 1469 1470 if(pDestLength){ 1471 *pDestLength = reqLength; 1472 } 1473 1474 /* Terminate the buffer */ 1475 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 1476 1477 return dest; 1478 } 1479 1480 1481 1482 static int32_t U_CALLCONV 1483 utf8TextExtract(UText *ut, 1484 int64_t start, int64_t limit, 1485 char16_t *dest, int32_t destCapacity, 1486 UErrorCode *pErrorCode) { 1487 if(U_FAILURE(*pErrorCode)) { 1488 return 0; 1489 } 1490 if(destCapacity<0 || (dest==nullptr && destCapacity>0)) { 1491 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1492 return 0; 1493 } 1494 int32_t length = ut->b; 1495 int32_t start32 = pinIndex(start, length); 1496 int32_t limit32 = pinIndex(limit, length); 1497 1498 if(start32>limit32) { 1499 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1500 return 0; 1501 } 1502 1503 1504 // adjust the incoming indexes to land on code point boundaries if needed. 
1505 // adjust by no more than three, because that is the largest number of trail bytes 1506 // in a well formed UTF8 character. 1507 const uint8_t *buf = (const uint8_t *)ut->context; 1508 int i; 1509 if (start32 < ut->chunkNativeLimit) { 1510 for (i=0; i<3; i++) { 1511 if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) { 1512 break; 1513 } 1514 start32--; 1515 } 1516 } 1517 1518 if (limit32 < ut->chunkNativeLimit) { 1519 for (i=0; i<3; i++) { 1520 if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) { 1521 break; 1522 } 1523 limit32--; 1524 } 1525 } 1526 1527 // Do the actual extract. 1528 int32_t destLength=0; 1529 utext_strFromUTF8(dest, destCapacity, &destLength, 1530 (const char *)ut->context+start32, limit32-start32, 1531 pErrorCode); 1532 utf8TextAccess(ut, limit32, true); 1533 return destLength; 1534 } 1535 1536 // 1537 // utf8TextMapOffsetToNative 1538 // 1539 // Map a chunk (UTF-16) offset to a native index. 1540 static int64_t U_CALLCONV 1541 utf8TextMapOffsetToNative(const UText *ut) { 1542 // 1543 UTF8Buf *u8b = (UTF8Buf *)ut->p; 1544 U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength); 1545 int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart; 1546 U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit); 1547 return nativeOffset; 1548 } 1549 1550 // 1551 // Map a native index to the corresponding chunk offset 1552 // 1553 static int32_t U_CALLCONV 1554 utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) { 1555 U_ASSERT(index64 <= 0x7fffffff); 1556 int32_t index = (int32_t)index64; 1557 UTF8Buf *u8b = (UTF8Buf *)ut->p; 1558 U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit); 1559 U_ASSERT(index<=ut->chunkNativeLimit); 1560 int32_t mapIndex = index - u8b->toUCharsMapStart; 1561 U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars)); 1562 int32_t offset = 
u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1563 U_ASSERT(offset>=0 && offset<=ut->chunkLength); 1564 return offset; 1565 } 1566 1567 static UText * U_CALLCONV 1568 utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) 1569 { 1570 // First do a generic shallow clone. Does everything needed for the UText struct itself. 1571 dest = shallowTextClone(dest, src, status); 1572 1573 // For deep clones, make a copy of the string. 1574 // The copied storage is owned by the newly created clone. 1575 // 1576 // TODO: There is an issue with using utext_nativeLength(). 1577 // That function is non-const in cases where the input was NUL terminated 1578 // and the length has not yet been determined. 1579 // This function (clone()) is const. 1580 // There potentially a thread safety issue lurking here. 1581 // 1582 if (deep && U_SUCCESS(*status)) { 1583 int32_t len = (int32_t)utext_nativeLength((UText *)src); 1584 char *copyStr = (char *)uprv_malloc(len+1); 1585 if (copyStr == nullptr) { 1586 *status = U_MEMORY_ALLOCATION_ERROR; 1587 } else { 1588 uprv_memcpy(copyStr, src->context, len+1); 1589 dest->context = copyStr; 1590 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1591 } 1592 } 1593 return dest; 1594 } 1595 1596 1597 static void U_CALLCONV 1598 utf8TextClose(UText *ut) { 1599 // Most of the work of close is done by the generic UText framework close. 1600 // All that needs to be done here is to delete the UTF8 string if the UText 1601 // owns it. This occurs if the UText was created by cloning. 
1602 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1603 char *s = (char *)ut->context; 1604 uprv_free(s); 1605 ut->context = nullptr; 1606 } 1607 } 1608 1609 U_CDECL_END 1610 1611 1612 static const struct UTextFuncs utf8Funcs = 1613 { 1614 sizeof(UTextFuncs), 1615 0, 0, 0, // Reserved alignment padding 1616 utf8TextClone, 1617 utf8TextLength, 1618 utf8TextAccess, 1619 utf8TextExtract, 1620 nullptr, /* replace*/ 1621 nullptr, /* copy */ 1622 utf8TextMapOffsetToNative, 1623 utf8TextMapIndexToUTF16, 1624 utf8TextClose, 1625 nullptr, // spare 1 1626 nullptr, // spare 2 1627 nullptr // spare 3 1628 }; 1629 1630 1631 static const char gEmptyString[] = {0}; 1632 1633 U_CAPI UText * U_EXPORT2 1634 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) { 1635 if(U_FAILURE(*status)) { 1636 return nullptr; 1637 } 1638 if(s==nullptr && length==0) { 1639 s = gEmptyString; 1640 } 1641 1642 if(s==nullptr || length<-1 || length>INT32_MAX) { 1643 *status=U_ILLEGAL_ARGUMENT_ERROR; 1644 return nullptr; 1645 } 1646 1647 ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status); 1648 if (U_FAILURE(*status)) { 1649 return ut; 1650 } 1651 1652 ut->pFuncs = &utf8Funcs; 1653 ut->context = s; 1654 ut->b = (int32_t)length; 1655 ut->c = (int32_t)length; 1656 if (ut->c < 0) { 1657 ut->c = 0; 1658 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1659 } 1660 ut->p = ut->pExtra; 1661 ut->q = (char *)ut->pExtra + sizeof(UTF8Buf); 1662 return ut; 1663 1664 } 1665 1666 1667 1668 1669 1670 1671 1672 1673 //------------------------------------------------------------------------------ 1674 // 1675 // UText implementation wrapper for Replaceable (read/write) 1676 // 1677 // Use of UText data members: 1678 // context pointer to Replaceable. 1679 // p pointer to Replaceable if it is owned by the UText. 
1680 // 1681 //------------------------------------------------------------------------------ 1682 1683 1684 1685 // minimum chunk size for this implementation: 3 1686 // to allow for possible trimming for code point boundaries 1687 enum { REP_TEXT_CHUNK_SIZE=10 }; 1688 1689 struct ReplExtra { 1690 /* 1691 * Chunk UChars. 1692 * +1 to simplify filling with surrogate pair at the end. 1693 */ 1694 char16_t s[REP_TEXT_CHUNK_SIZE+1]; 1695 }; 1696 1697 1698 U_CDECL_BEGIN 1699 1700 static UText * U_CALLCONV 1701 repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { 1702 // First do a generic shallow clone. Does everything needed for the UText struct itself. 1703 dest = shallowTextClone(dest, src, status); 1704 1705 // For deep clones, make a copy of the Replaceable. 1706 // The copied Replaceable storage is owned by the newly created UText clone. 1707 // A non-nullptr pointer in UText.p is the signal to the close() function to delete 1708 // it. 1709 // 1710 if (deep && U_SUCCESS(*status)) { 1711 const Replaceable *replSrc = (const Replaceable *)src->context; 1712 dest->context = replSrc->clone(); 1713 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1714 1715 // with deep clone, the copy is writable, even when the source is not. 1716 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 1717 } 1718 return dest; 1719 } 1720 1721 1722 static void U_CALLCONV 1723 repTextClose(UText *ut) { 1724 // Most of the work of close is done by the generic UText framework close. 1725 // All that needs to be done here is delete the Replaceable if the UText 1726 // owns it. This occurs if the UText was created by cloning. 
1727 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1728 Replaceable *rep = (Replaceable *)ut->context; 1729 delete rep; 1730 ut->context = nullptr; 1731 } 1732 } 1733 1734 1735 static int64_t U_CALLCONV 1736 repTextLength(UText *ut) { 1737 const Replaceable *replSrc = (const Replaceable *)ut->context; 1738 int32_t len = replSrc->length(); 1739 return len; 1740 } 1741 1742 1743 static UBool U_CALLCONV 1744 repTextAccess(UText *ut, int64_t index, UBool forward) { 1745 const Replaceable *rep=(const Replaceable *)ut->context; 1746 int32_t length=rep->length(); // Full length of the input text (bigger than a chunk) 1747 1748 // clip the requested index to the limits of the text. 1749 int32_t index32 = pinIndex(index, length); 1750 U_ASSERT(index<=INT32_MAX); 1751 1752 1753 /* 1754 * Compute start/limit boundaries around index, for a segment of text 1755 * to be extracted. 1756 * To allow for the possibility that our user gave an index to the trailing 1757 * half of a surrogate pair, we must request one extra preceding char16_t when 1758 * going in the forward direction. This will ensure that the buffer has the 1759 * entire code point at the specified index. 1760 */ 1761 if(forward) { 1762 1763 if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) { 1764 // Buffer already contains the requested position. 1765 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 1766 return true; 1767 } 1768 if (index32>=length && ut->chunkNativeLimit==length) { 1769 // Request for end of string, and buffer already extends up to it. 1770 // Can't get the data, but don't change the buffer. 1771 ut->chunkOffset = length - (int32_t)ut->chunkNativeStart; 1772 return false; 1773 } 1774 1775 ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1; 1776 // Going forward, so we want to have the buffer with stuff at and beyond 1777 // the requested index. 
The -1 gets us one code point before the 1778 // requested index also, to handle the case of the index being on 1779 // a trail surrogate of a surrogate pair. 1780 if(ut->chunkNativeLimit > length) { 1781 ut->chunkNativeLimit = length; 1782 } 1783 // unless buffer ran off end, start is index-1. 1784 ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE; 1785 if(ut->chunkNativeStart < 0) { 1786 ut->chunkNativeStart = 0; 1787 } 1788 } else { 1789 // Reverse iteration. Fill buffer with data preceding the requested index. 1790 if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) { 1791 // Requested position already in buffer. 1792 ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart; 1793 return true; 1794 } 1795 if (index32==0 && ut->chunkNativeStart==0) { 1796 // Request for start, buffer already begins at start. 1797 // No data, but keep the buffer as is. 1798 ut->chunkOffset = 0; 1799 return false; 1800 } 1801 1802 // Figure out the bounds of the chunk to extract for reverse iteration. 1803 // Need to worry about chunk not splitting surrogate pairs, and while still 1804 // containing the data we need. 1805 // Fix by requesting a chunk that includes an extra char16_t at the end. 1806 // If this turns out to be a lead surrogate, we can lop it off and still have 1807 // the data we wanted. 1808 ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE; 1809 if (ut->chunkNativeStart < 0) { 1810 ut->chunkNativeStart = 0; 1811 } 1812 1813 ut->chunkNativeLimit = index32 + 1; 1814 if (ut->chunkNativeLimit > length) { 1815 ut->chunkNativeLimit = length; 1816 } 1817 } 1818 1819 // Extract the new chunk of text from the Replaceable source. 
1820 ReplExtra *ex = (ReplExtra *)ut->pExtra; 1821 // UnicodeString with its buffer a writable alias to the chunk buffer 1822 UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/); 1823 rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer); 1824 1825 ut->chunkContents = ex->s; 1826 ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart); 1827 ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart); 1828 1829 // Surrogate pairs from the input text must not span chunk boundaries. 1830 // If end of chunk could be the start of a surrogate, trim it off. 1831 if (ut->chunkNativeLimit < length && 1832 U16_IS_LEAD(ex->s[ut->chunkLength-1])) { 1833 ut->chunkLength--; 1834 ut->chunkNativeLimit--; 1835 if (ut->chunkOffset > ut->chunkLength) { 1836 ut->chunkOffset = ut->chunkLength; 1837 } 1838 } 1839 1840 // if the first char16_t in the chunk could be the trailing half of a surrogate pair, 1841 // trim it off. 
1842 if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) { 1843 ++(ut->chunkContents); 1844 ++(ut->chunkNativeStart); 1845 --(ut->chunkLength); 1846 --(ut->chunkOffset); 1847 } 1848 1849 // adjust the index/chunkOffset to a code point boundary 1850 U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset); 1851 1852 // Use fast indexing for get/setNativeIndex() 1853 ut->nativeIndexingLimit = ut->chunkLength; 1854 1855 return true; 1856 } 1857 1858 1859 1860 static int32_t U_CALLCONV 1861 repTextExtract(UText *ut, 1862 int64_t start, int64_t limit, 1863 char16_t *dest, int32_t destCapacity, 1864 UErrorCode *status) { 1865 const Replaceable *rep=(const Replaceable *)ut->context; 1866 int32_t length=rep->length(); 1867 1868 if(U_FAILURE(*status)) { 1869 return 0; 1870 } 1871 if(destCapacity<0 || (dest==nullptr && destCapacity>0)) { 1872 *status=U_ILLEGAL_ARGUMENT_ERROR; 1873 } 1874 if(start>limit) { 1875 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1876 return 0; 1877 } 1878 1879 int32_t start32 = pinIndex(start, length); 1880 int32_t limit32 = pinIndex(limit, length); 1881 1882 // adjust start, limit if they point to trail half of surrogates 1883 if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) && 1884 U_IS_SUPPLEMENTARY(rep->char32At(start32))){ 1885 start32--; 1886 } 1887 if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) && 1888 U_IS_SUPPLEMENTARY(rep->char32At(limit32))){ 1889 limit32--; 1890 } 1891 1892 length=limit32-start32; 1893 if(length>destCapacity) { 1894 limit32 = start32 + destCapacity; 1895 } 1896 UnicodeString buffer(dest, 0, destCapacity); // writable alias 1897 rep->extractBetween(start32, limit32, buffer); 1898 repTextAccess(ut, limit32, true); 1899 1900 return u_terminateUChars(dest, destCapacity, length, status); 1901 } 1902 1903 static int32_t U_CALLCONV 1904 repTextReplace(UText *ut, 1905 int64_t start, int64_t limit, 1906 const char16_t *src, int32_t length, 1907 UErrorCode *status) { 1908 Replaceable *rep=(Replaceable *)ut->context; 1909 
int32_t oldLength; 1910 1911 if(U_FAILURE(*status)) { 1912 return 0; 1913 } 1914 if(src==nullptr && length!=0) { 1915 *status=U_ILLEGAL_ARGUMENT_ERROR; 1916 return 0; 1917 } 1918 oldLength=rep->length(); // will subtract from new length 1919 if(start>limit ) { 1920 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1921 return 0; 1922 } 1923 1924 int32_t start32 = pinIndex(start, oldLength); 1925 int32_t limit32 = pinIndex(limit, oldLength); 1926 1927 // Snap start & limit to code point boundaries. 1928 if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) && 1929 start32>0 && U16_IS_LEAD(rep->charAt(start32-1))) 1930 { 1931 start32--; 1932 } 1933 if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) && 1934 U16_IS_TRAIL(rep->charAt(limit32))) 1935 { 1936 limit32++; 1937 } 1938 1939 // Do the actual replace operation using methods of the Replaceable class 1940 UnicodeString replStr(length < 0, src, length); // read-only alias 1941 rep->handleReplaceBetween(start32, limit32, replStr); 1942 int32_t newLength = rep->length(); 1943 int32_t lengthDelta = newLength - oldLength; 1944 1945 // Is the UText chunk buffer OK? 1946 if (ut->chunkNativeLimit > start32) { 1947 // this replace operation may have impacted the current chunk. 1948 // invalidate it, which will force a reload on the next access. 1949 invalidateChunk(ut); 1950 } 1951 1952 // set the iteration position to the end of the newly inserted replacement text. 
1953 int32_t newIndexPos = limit32 + lengthDelta; 1954 repTextAccess(ut, newIndexPos, true); 1955 1956 return lengthDelta; 1957 } 1958 1959 1960 static void U_CALLCONV 1961 repTextCopy(UText *ut, 1962 int64_t start, int64_t limit, 1963 int64_t destIndex, 1964 UBool move, 1965 UErrorCode *status) 1966 { 1967 Replaceable *rep=(Replaceable *)ut->context; 1968 int32_t length=rep->length(); 1969 1970 if(U_FAILURE(*status)) { 1971 return; 1972 } 1973 if (start>limit || (start<destIndex && destIndex<limit)) 1974 { 1975 *status=U_INDEX_OUTOFBOUNDS_ERROR; 1976 return; 1977 } 1978 1979 int32_t start32 = pinIndex(start, length); 1980 int32_t limit32 = pinIndex(limit, length); 1981 int32_t destIndex32 = pinIndex(destIndex, length); 1982 1983 // TODO: snap input parameters to code point boundaries. 1984 1985 if(move) { 1986 // move: copy to destIndex, then replace original with nothing 1987 int32_t segLength=limit32-start32; 1988 rep->copy(start32, limit32, destIndex32); 1989 if(destIndex32<start32) { 1990 start32+=segLength; 1991 limit32+=segLength; 1992 } 1993 rep->handleReplaceBetween(start32, limit32, UnicodeString()); 1994 } else { 1995 // copy 1996 rep->copy(start32, limit32, destIndex32); 1997 } 1998 1999 // If the change to the text touched the region in the chunk buffer, 2000 // invalidate the buffer. 2001 int32_t firstAffectedIndex = destIndex32; 2002 if (move && start32<firstAffectedIndex) { 2003 firstAffectedIndex = start32; 2004 } 2005 if (firstAffectedIndex < ut->chunkNativeLimit) { 2006 // changes may have affected range covered by the chunk 2007 invalidateChunk(ut); 2008 } 2009 2010 // Put iteration position at the newly inserted (moved) block, 2011 int32_t nativeIterIndex = destIndex32 + limit32 - start32; 2012 if (move && destIndex32>start32) { 2013 // moved a block of text towards the end of the string. 2014 nativeIterIndex = destIndex32; 2015 } 2016 2017 // Set position, reload chunk if needed. 
2018 repTextAccess(ut, nativeIterIndex, true); 2019 } 2020 2021 static const struct UTextFuncs repFuncs = 2022 { 2023 sizeof(UTextFuncs), 2024 0, 0, 0, // Reserved alignment padding 2025 repTextClone, 2026 repTextLength, 2027 repTextAccess, 2028 repTextExtract, 2029 repTextReplace, 2030 repTextCopy, 2031 nullptr, // MapOffsetToNative, 2032 nullptr, // MapIndexToUTF16, 2033 repTextClose, 2034 nullptr, // spare 1 2035 nullptr, // spare 2 2036 nullptr // spare 3 2037 }; 2038 2039 2040 U_CAPI UText * U_EXPORT2 2041 utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status) 2042 { 2043 if(U_FAILURE(*status)) { 2044 return nullptr; 2045 } 2046 if(rep==nullptr) { 2047 *status=U_ILLEGAL_ARGUMENT_ERROR; 2048 return nullptr; 2049 } 2050 ut = utext_setup(ut, sizeof(ReplExtra), status); 2051 if(U_FAILURE(*status)) { 2052 return ut; 2053 } 2054 2055 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE); 2056 if(rep->hasMetaData()) { 2057 ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA); 2058 } 2059 2060 ut->pFuncs = &repFuncs; 2061 ut->context = rep; 2062 return ut; 2063 } 2064 2065 U_CDECL_END 2066 2067 2068 2069 2070 2071 2072 2073 2074 //------------------------------------------------------------------------------ 2075 // 2076 // UText implementation for UnicodeString (read/write) and 2077 // for const UnicodeString (read only) 2078 // (same implementation, only the flags are different) 2079 // 2080 // Use of UText data members: 2081 // context pointer to UnicodeString 2082 // p pointer to UnicodeString IF this UText owns the string 2083 // and it must be deleted on close(). nullptr otherwise. 2084 // 2085 //------------------------------------------------------------------------------ 2086 2087 U_CDECL_BEGIN 2088 2089 2090 static UText * U_CALLCONV 2091 unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { 2092 // First do a generic shallow clone. Does everything needed for the UText struct itself. 
2093 dest = shallowTextClone(dest, src, status); 2094 2095 // For deep clones, make a copy of the UnicodeSring. 2096 // The copied UnicodeString storage is owned by the newly created UText clone. 2097 // A non-nullptr pointer in UText.p is the signal to the close() function to delete 2098 // the UText. 2099 // 2100 if (deep && U_SUCCESS(*status)) { 2101 const UnicodeString *srcString = (const UnicodeString *)src->context; 2102 dest->context = new UnicodeString(*srcString); 2103 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 2104 2105 // with deep clone, the copy is writable, even when the source is not. 2106 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 2107 } 2108 return dest; 2109 } 2110 2111 static void U_CALLCONV 2112 unistrTextClose(UText *ut) { 2113 // Most of the work of close is done by the generic UText framework close. 2114 // All that needs to be done here is delete the UnicodeString if the UText 2115 // owns it. This occurs if the UText was created by cloning. 
2116 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 2117 UnicodeString *str = (UnicodeString *)ut->context; 2118 delete str; 2119 ut->context = nullptr; 2120 } 2121 } 2122 2123 2124 static int64_t U_CALLCONV 2125 unistrTextLength(UText *t) { 2126 return ((const UnicodeString *)t->context)->length(); 2127 } 2128 2129 2130 static UBool U_CALLCONV 2131 unistrTextAccess(UText *ut, int64_t index, UBool forward) { 2132 int32_t length = ut->chunkLength; 2133 ut->chunkOffset = pinIndex(index, length); 2134 2135 // Check whether request is at the start or end 2136 UBool retVal = (forward && index<length) || (!forward && index>0); 2137 return retVal; 2138 } 2139 2140 2141 2142 static int32_t U_CALLCONV 2143 unistrTextExtract(UText *t, 2144 int64_t start, int64_t limit, 2145 char16_t *dest, int32_t destCapacity, 2146 UErrorCode *pErrorCode) { 2147 const UnicodeString *us=(const UnicodeString *)t->context; 2148 int32_t length=us->length(); 2149 2150 if(U_FAILURE(*pErrorCode)) { 2151 return 0; 2152 } 2153 if(destCapacity<0 || (dest==nullptr && destCapacity>0)) { 2154 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 2155 } 2156 if(start<0 || start>limit) { 2157 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2158 return 0; 2159 } 2160 2161 int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length; 2162 int32_t limit32 = limit<length ? 
us->getChar32Start((int32_t)limit) : length; 2163 2164 length=limit32-start32; 2165 if (destCapacity>0 && dest!=nullptr) { 2166 int32_t trimmedLength = length; 2167 if(trimmedLength>destCapacity) { 2168 trimmedLength=destCapacity; 2169 } 2170 us->extract(start32, trimmedLength, dest); 2171 t->chunkOffset = start32+trimmedLength; 2172 } else { 2173 t->chunkOffset = start32; 2174 } 2175 u_terminateUChars(dest, destCapacity, length, pErrorCode); 2176 return length; 2177 } 2178 2179 static int32_t U_CALLCONV 2180 unistrTextReplace(UText *ut, 2181 int64_t start, int64_t limit, 2182 const char16_t *src, int32_t length, 2183 UErrorCode *pErrorCode) { 2184 UnicodeString *us=(UnicodeString *)ut->context; 2185 int32_t oldLength; 2186 2187 if(U_FAILURE(*pErrorCode)) { 2188 return 0; 2189 } 2190 if(src==nullptr && length!=0) { 2191 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 2192 } 2193 if(start>limit) { 2194 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2195 return 0; 2196 } 2197 oldLength=us->length(); 2198 int32_t start32 = pinIndex(start, oldLength); 2199 int32_t limit32 = pinIndex(limit, oldLength); 2200 if (start32 < oldLength) { 2201 start32 = us->getChar32Start(start32); 2202 } 2203 if (limit32 < oldLength) { 2204 limit32 = us->getChar32Start(limit32); 2205 } 2206 2207 // replace 2208 us->replace(start32, limit32-start32, src, length); 2209 int32_t newLength = us->length(); 2210 2211 // Update the chunk description. 2212 ut->chunkContents = us->getBuffer(); 2213 ut->chunkLength = newLength; 2214 ut->chunkNativeLimit = newLength; 2215 ut->nativeIndexingLimit = newLength; 2216 2217 // Set iteration position to the point just following the newly inserted text. 
2218 int32_t lengthDelta = newLength - oldLength; 2219 ut->chunkOffset = limit32 + lengthDelta; 2220 2221 return lengthDelta; 2222 } 2223 2224 static void U_CALLCONV 2225 unistrTextCopy(UText *ut, 2226 int64_t start, int64_t limit, 2227 int64_t destIndex, 2228 UBool move, 2229 UErrorCode *pErrorCode) { 2230 UnicodeString *us=(UnicodeString *)ut->context; 2231 int32_t length=us->length(); 2232 2233 if(U_FAILURE(*pErrorCode)) { 2234 return; 2235 } 2236 int32_t start32 = pinIndex(start, length); 2237 int32_t limit32 = pinIndex(limit, length); 2238 int32_t destIndex32 = pinIndex(destIndex, length); 2239 2240 if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) { 2241 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2242 return; 2243 } 2244 2245 if(move) { 2246 // move: copy to destIndex, then remove original 2247 int32_t segLength=limit32-start32; 2248 us->copy(start32, limit32, destIndex32); 2249 if(destIndex32<start32) { 2250 start32+=segLength; 2251 } 2252 us->remove(start32, segLength); 2253 } else { 2254 // copy 2255 us->copy(start32, limit32, destIndex32); 2256 } 2257 2258 // update chunk description, set iteration position. 2259 ut->chunkContents = us->getBuffer(); 2260 if (move==false) { 2261 // copy operation, string length grows 2262 ut->chunkLength += limit32-start32; 2263 ut->chunkNativeLimit = ut->chunkLength; 2264 ut->nativeIndexingLimit = ut->chunkLength; 2265 } 2266 2267 // Iteration position to end of the newly inserted text. 
2268 ut->chunkOffset = destIndex32+limit32-start32; 2269 if (move && destIndex32>start32) { 2270 ut->chunkOffset = destIndex32; 2271 } 2272 2273 } 2274 2275 static const struct UTextFuncs unistrFuncs = 2276 { 2277 sizeof(UTextFuncs), 2278 0, 0, 0, // Reserved alignment padding 2279 unistrTextClone, 2280 unistrTextLength, 2281 unistrTextAccess, 2282 unistrTextExtract, 2283 unistrTextReplace, 2284 unistrTextCopy, 2285 nullptr, // MapOffsetToNative, 2286 nullptr, // MapIndexToUTF16, 2287 unistrTextClose, 2288 nullptr, // spare 1 2289 nullptr, // spare 2 2290 nullptr // spare 3 2291 }; 2292 2293 2294 2295 U_CDECL_END 2296 2297 2298 U_CAPI UText * U_EXPORT2 2299 utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) { 2300 ut = utext_openConstUnicodeString(ut, s, status); 2301 if (U_SUCCESS(*status)) { 2302 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 2303 } 2304 return ut; 2305 } 2306 2307 2308 2309 U_CAPI UText * U_EXPORT2 2310 utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) { 2311 if (U_SUCCESS(*status) && s->isBogus()) { 2312 // The UnicodeString is bogus, but we still need to detach the UText 2313 // from whatever it was hooked to before, if anything. 2314 utext_openUChars(ut, nullptr, 0, status); 2315 *status = U_ILLEGAL_ARGUMENT_ERROR; 2316 return ut; 2317 } 2318 ut = utext_setup(ut, 0, status); 2319 // note: use the standard (writable) function table for UnicodeString. 2320 // The flag settings disable writing, so having the functions in 2321 // the table is harmless. 
2322 if (U_SUCCESS(*status)) { 2323 ut->pFuncs = &unistrFuncs; 2324 ut->context = s; 2325 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); 2326 ut->chunkContents = s->getBuffer(); 2327 ut->chunkLength = s->length(); 2328 ut->chunkNativeStart = 0; 2329 ut->chunkNativeLimit = ut->chunkLength; 2330 ut->nativeIndexingLimit = ut->chunkLength; 2331 } 2332 return ut; 2333 } 2334 2335 //------------------------------------------------------------------------------ 2336 // 2337 // UText implementation for const char16_t * strings 2338 // 2339 // Use of UText data members: 2340 // context pointer to UnicodeString 2341 // a length. -1 if not yet known. 2342 // 2343 // TODO: support 64 bit lengths. 2344 // 2345 //------------------------------------------------------------------------------ 2346 2347 U_CDECL_BEGIN 2348 2349 2350 static UText * U_CALLCONV 2351 ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) { 2352 // First do a generic shallow clone. 2353 dest = shallowTextClone(dest, src, status); 2354 2355 // For deep clones, make a copy of the string. 2356 // The copied storage is owned by the newly created clone. 2357 // A non-nullptr pointer in UText.p is the signal to the close() function to delete 2358 // it. 2359 // 2360 if (deep && U_SUCCESS(*status)) { 2361 U_ASSERT(utext_nativeLength(dest) < INT32_MAX); 2362 int32_t len = (int32_t)utext_nativeLength(dest); 2363 2364 // The cloned string IS going to be NUL terminated, whether or not the original was. 
2365 const char16_t *srcStr = (const char16_t *)src->context; 2366 char16_t *copyStr = (char16_t *)uprv_malloc((len+1) * sizeof(char16_t)); 2367 if (copyStr == nullptr) { 2368 *status = U_MEMORY_ALLOCATION_ERROR; 2369 } else { 2370 int64_t i; 2371 for (i=0; i<len; i++) { 2372 copyStr[i] = srcStr[i]; 2373 } 2374 copyStr[len] = 0; 2375 dest->context = copyStr; 2376 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 2377 } 2378 } 2379 return dest; 2380 } 2381 2382 2383 static void U_CALLCONV 2384 ucstrTextClose(UText *ut) { 2385 // Most of the work of close is done by the generic UText framework close. 2386 // All that needs to be done here is delete the string if the UText 2387 // owns it. This occurs if the UText was created by cloning. 2388 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 2389 char16_t *s = (char16_t *)ut->context; 2390 uprv_free(s); 2391 ut->context = nullptr; 2392 } 2393 } 2394 2395 2396 2397 static int64_t U_CALLCONV 2398 ucstrTextLength(UText *ut) { 2399 if (ut->a < 0) { 2400 // null terminated, we don't yet know the length. Scan for it. 2401 // Access is not convenient for doing this 2402 // because the current iteration position can't be changed. 2403 const char16_t *str = (const char16_t *)ut->context; 2404 for (;;) { 2405 if (str[ut->chunkNativeLimit] == 0) { 2406 break; 2407 } 2408 ut->chunkNativeLimit++; 2409 } 2410 ut->a = ut->chunkNativeLimit; 2411 ut->chunkLength = (int32_t)ut->chunkNativeLimit; 2412 ut->nativeIndexingLimit = ut->chunkLength; 2413 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2414 } 2415 return ut->a; 2416 } 2417 2418 2419 static UBool U_CALLCONV 2420 ucstrTextAccess(UText *ut, int64_t index, UBool forward) { 2421 const char16_t *str = (const char16_t *)ut->context; 2422 2423 // pin the requested index to the bounds of the string, 2424 // and set current iteration position. 
2425 if (index<0) { 2426 index = 0; 2427 } else if (index < ut->chunkNativeLimit) { 2428 // The request data is within the chunk as it is known so far. 2429 // Put index on a code point boundary. 2430 U16_SET_CP_START(str, 0, index); 2431 } else if (ut->a >= 0) { 2432 // We know the length of this string, and the user is requesting something 2433 // at or beyond the length. Pin the requested index to the length. 2434 index = ut->a; 2435 } else { 2436 // Null terminated string, length not yet known, and the requested index 2437 // is beyond where we have scanned so far. 2438 // Scan to 32 UChars beyond the requested index. The strategy here is 2439 // to avoid fully scanning a long string when the caller only wants to 2440 // see a few characters at its beginning. 2441 int32_t scanLimit = (int32_t)index + 32; 2442 if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression 2443 scanLimit = INT32_MAX; 2444 } 2445 2446 int32_t chunkLimit = (int32_t)ut->chunkNativeLimit; 2447 for (; chunkLimit<scanLimit; chunkLimit++) { 2448 if (str[chunkLimit] == 0) { 2449 // We found the end of the string. Remember it, pin the requested index to it, 2450 // and bail out of here. 2451 ut->a = chunkLimit; 2452 ut->chunkLength = chunkLimit; 2453 ut->nativeIndexingLimit = chunkLimit; 2454 if (index >= chunkLimit) { 2455 index = chunkLimit; 2456 } else { 2457 U16_SET_CP_START(str, 0, index); 2458 } 2459 2460 ut->chunkNativeLimit = chunkLimit; 2461 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2462 goto breakout; 2463 } 2464 } 2465 // We scanned through the next batch of UChars without finding the end. 2466 U16_SET_CP_START(str, 0, index); 2467 if (chunkLimit == INT32_MAX) { 2468 // Scanned to the limit of a 32 bit length. 2469 // Forceably trim the overlength string back so length fits in int32 2470 // TODO: add support for 64 bit strings. 
2471 ut->a = chunkLimit; 2472 ut->chunkLength = chunkLimit; 2473 ut->nativeIndexingLimit = chunkLimit; 2474 if (index > chunkLimit) { 2475 index = chunkLimit; 2476 } 2477 ut->chunkNativeLimit = chunkLimit; 2478 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2479 } else { 2480 // The endpoint of a chunk must not be left in the middle of a surrogate pair. 2481 // If the current end is on a lead surrogate, back the end up by one. 2482 // It doesn't matter if the end char happens to be an unpaired surrogate, 2483 // and it's simpler not to worry about it. 2484 if (U16_IS_LEAD(str[chunkLimit-1])) { 2485 --chunkLimit; 2486 } 2487 // Null-terminated chunk with end still unknown. 2488 // Update the chunk length to reflect what has been scanned thus far. 2489 // That the full length is still unknown is (still) flagged by 2490 // ut->a being < 0. 2491 ut->chunkNativeLimit = chunkLimit; 2492 ut->nativeIndexingLimit = chunkLimit; 2493 ut->chunkLength = chunkLimit; 2494 } 2495 2496 } 2497 breakout: 2498 U_ASSERT(index<=INT32_MAX); 2499 ut->chunkOffset = (int32_t)index; 2500 2501 // Check whether request is at the start or end 2502 UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0); 2503 return retVal; 2504 } 2505 2506 2507 2508 static int32_t U_CALLCONV 2509 ucstrTextExtract(UText *ut, 2510 int64_t start, int64_t limit, 2511 char16_t *dest, int32_t destCapacity, 2512 UErrorCode *pErrorCode) 2513 { 2514 if(U_FAILURE(*pErrorCode)) { 2515 return 0; 2516 } 2517 if(destCapacity<0 || (dest==nullptr && destCapacity>0) || start>limit) { 2518 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 2519 return 0; 2520 } 2521 2522 //const char16_t *s=(const char16_t *)ut->context; 2523 int32_t si, di; 2524 2525 int32_t start32; 2526 int32_t limit32; 2527 2528 // Access the start. Does two things we need: 2529 // Pins 'start' to the length of the string, if it came in out-of-bounds. 2530 // Snaps 'start' to the beginning of a code point. 
2531 ucstrTextAccess(ut, start, true); 2532 const char16_t *s=ut->chunkContents; 2533 start32 = ut->chunkOffset; 2534 2535 int32_t strLength=(int32_t)ut->a; 2536 if (strLength >= 0) { 2537 limit32 = pinIndex(limit, strLength); 2538 } else { 2539 limit32 = pinIndex(limit, INT32_MAX); 2540 } 2541 di = 0; 2542 for (si=start32; si<limit32; si++) { 2543 if (strLength<0 && s[si]==0) { 2544 // Just hit the end of a null-terminated string. 2545 ut->a = si; // set string length for this UText 2546 ut->chunkNativeLimit = si; 2547 ut->chunkLength = si; 2548 ut->nativeIndexingLimit = si; 2549 strLength = si; 2550 limit32 = si; 2551 break; 2552 } 2553 U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */ 2554 if (di<destCapacity) { 2555 // only store if there is space. 2556 dest[di] = s[si]; 2557 } else { 2558 if (strLength>=0) { 2559 // We have filled the destination buffer, and the string length is known. 2560 // Cut the loop short. There is no need to scan string termination. 2561 di = limit32 - start32; 2562 si = limit32; 2563 break; 2564 } 2565 } 2566 di++; 2567 } 2568 2569 // If the limit index points to a lead surrogate of a pair, 2570 // add the corresponding trail surrogate to the destination. 2571 if (si>0 && U16_IS_LEAD(s[si-1]) && 2572 ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si]))) 2573 { 2574 if (di<destCapacity) { 2575 // store only if there is space in the output buffer. 2576 dest[di++] = s[si]; 2577 } 2578 si++; 2579 } 2580 2581 // Put iteration position at the point just following the extracted text 2582 if (si <= ut->chunkNativeLimit) { 2583 ut->chunkOffset = si; 2584 } else { 2585 ucstrTextAccess(ut, si, true); 2586 } 2587 2588 // Add a terminating NUL if space in the buffer permits, 2589 // and set the error status as required. 
2590 u_terminateUChars(dest, destCapacity, di, pErrorCode); 2591 return di; 2592 } 2593 2594 static const struct UTextFuncs ucstrFuncs = 2595 { 2596 sizeof(UTextFuncs), 2597 0, 0, 0, // Reserved alignment padding 2598 ucstrTextClone, 2599 ucstrTextLength, 2600 ucstrTextAccess, 2601 ucstrTextExtract, 2602 nullptr, // Replace 2603 nullptr, // Copy 2604 nullptr, // MapOffsetToNative, 2605 nullptr, // MapIndexToUTF16, 2606 ucstrTextClose, 2607 nullptr, // spare 1 2608 nullptr, // spare 2 2609 nullptr, // spare 3 2610 }; 2611 2612 U_CDECL_END 2613 2614 static const char16_t gEmptyUString[] = {0}; 2615 2616 U_CAPI UText * U_EXPORT2 2617 utext_openUChars(UText *ut, const char16_t *s, int64_t length, UErrorCode *status) { 2618 if (U_FAILURE(*status)) { 2619 return nullptr; 2620 } 2621 if(s==nullptr && length==0) { 2622 s = gEmptyUString; 2623 } 2624 if (s==nullptr || length < -1 || length>INT32_MAX) { 2625 *status = U_ILLEGAL_ARGUMENT_ERROR; 2626 return nullptr; 2627 } 2628 ut = utext_setup(ut, 0, status); 2629 if (U_SUCCESS(*status)) { 2630 ut->pFuncs = &ucstrFuncs; 2631 ut->context = s; 2632 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); 2633 if (length==-1) { 2634 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 2635 } 2636 ut->a = length; 2637 ut->chunkContents = s; 2638 ut->chunkNativeStart = 0; 2639 ut->chunkNativeLimit = length>=0? length : 0; 2640 ut->chunkLength = (int32_t)ut->chunkNativeLimit; 2641 ut->chunkOffset = 0; 2642 ut->nativeIndexingLimit = ut->chunkLength; 2643 } 2644 return ut; 2645 } 2646 2647 2648 //------------------------------------------------------------------------------ 2649 // 2650 // UText implementation for text from ICU CharacterIterators 2651 // 2652 // Use of UText data members: 2653 // context pointer to the CharacterIterator 2654 // a length of the full text. 
2655 // p pointer to buffer 1 2656 // b start index of local buffer 1 contents 2657 // q pointer to buffer 2 2658 // c start index of local buffer 2 contents 2659 // r pointer to the character iterator if the UText owns it. 2660 // Null otherwise. 2661 // 2662 //------------------------------------------------------------------------------ 2663 #define CIBufSize 16 2664 2665 U_CDECL_BEGIN 2666 static void U_CALLCONV 2667 charIterTextClose(UText *ut) { 2668 // Most of the work of close is done by the generic UText framework close. 2669 // All that needs to be done here is delete the CharacterIterator if the UText 2670 // owns it. This occurs if the UText was created by cloning. 2671 CharacterIterator *ci = (CharacterIterator *)ut->r; 2672 delete ci; 2673 ut->r = nullptr; 2674 } 2675 2676 static int64_t U_CALLCONV 2677 charIterTextLength(UText *ut) { 2678 return (int32_t)ut->a; 2679 } 2680 2681 static UBool U_CALLCONV 2682 charIterTextAccess(UText *ut, int64_t index, UBool forward) { 2683 CharacterIterator *ci = (CharacterIterator *)ut->context; 2684 2685 int32_t clippedIndex = (int32_t)index; 2686 if (clippedIndex<0) { 2687 clippedIndex=0; 2688 } else if (clippedIndex>=ut->a) { 2689 clippedIndex=(int32_t)ut->a; 2690 } 2691 int32_t neededIndex = clippedIndex; 2692 if (!forward && neededIndex>0) { 2693 // reverse iteration, want the position just before what was asked for. 2694 neededIndex--; 2695 } else if (forward && neededIndex==ut->a && neededIndex>0) { 2696 // Forward iteration, don't ask for something past the end of the text. 2697 neededIndex--; 2698 } 2699 2700 // Find the native index of the start of the buffer containing what we want. 2701 neededIndex -= neededIndex % CIBufSize; 2702 2703 char16_t *buf = nullptr; 2704 UBool needChunkSetup = true; 2705 int i; 2706 if (ut->chunkNativeStart == neededIndex) { 2707 // The buffer we want is already the current chunk. 
2708 needChunkSetup = false; 2709 } else if (ut->b == neededIndex) { 2710 // The first buffer (buffer p) has what we need. 2711 buf = (char16_t *)ut->p; 2712 } else if (ut->c == neededIndex) { 2713 // The second buffer (buffer q) has what we need. 2714 buf = (char16_t *)ut->q; 2715 } else { 2716 // Neither buffer already has what we need. 2717 // Load new data from the character iterator. 2718 // Use the buf that is not the current buffer. 2719 buf = (char16_t *)ut->p; 2720 if (ut->p == ut->chunkContents) { 2721 buf = (char16_t *)ut->q; 2722 } 2723 ci->setIndex(neededIndex); 2724 for (i=0; i<CIBufSize; i++) { 2725 buf[i] = ci->nextPostInc(); 2726 if (i+neededIndex > ut->a) { 2727 break; 2728 } 2729 } 2730 } 2731 2732 // We have a buffer with the data we need. 2733 // Set it up as the current chunk, if it wasn't already. 2734 if (needChunkSetup) { 2735 ut->chunkContents = buf; 2736 ut->chunkLength = CIBufSize; 2737 ut->chunkNativeStart = neededIndex; 2738 ut->chunkNativeLimit = neededIndex + CIBufSize; 2739 if (ut->chunkNativeLimit > ut->a) { 2740 ut->chunkNativeLimit = ut->a; 2741 ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart); 2742 } 2743 ut->nativeIndexingLimit = ut->chunkLength; 2744 U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize); 2745 } 2746 ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart; 2747 UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0); 2748 return success; 2749 } 2750 2751 static UText * U_CALLCONV 2752 charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) { 2753 if (U_FAILURE(*status)) { 2754 return nullptr; 2755 } 2756 2757 if (deep) { 2758 // There is no CharacterIterator API for cloning the underlying text storage. 
2759 *status = U_UNSUPPORTED_ERROR; 2760 return nullptr; 2761 } else { 2762 CharacterIterator *srcCI =(CharacterIterator *)src->context; 2763 srcCI = srcCI->clone(); 2764 dest = utext_openCharacterIterator(dest, srcCI, status); 2765 if (U_FAILURE(*status)) { 2766 return dest; 2767 } 2768 // cast off const on getNativeIndex. 2769 // For CharacterIterator based UTexts, this is safe, the operation is const. 2770 int64_t ix = utext_getNativeIndex((UText *)src); 2771 utext_setNativeIndex(dest, ix); 2772 dest->r = srcCI; // flags that this UText owns the CharacterIterator 2773 } 2774 return dest; 2775 } 2776 2777 static int32_t U_CALLCONV 2778 charIterTextExtract(UText *ut, 2779 int64_t start, int64_t limit, 2780 char16_t *dest, int32_t destCapacity, 2781 UErrorCode *status) 2782 { 2783 if(U_FAILURE(*status)) { 2784 return 0; 2785 } 2786 if(destCapacity<0 || (dest==nullptr && destCapacity>0) || start>limit) { 2787 *status=U_ILLEGAL_ARGUMENT_ERROR; 2788 return 0; 2789 } 2790 int32_t length = (int32_t)ut->a; 2791 int32_t start32 = pinIndex(start, length); 2792 int32_t limit32 = pinIndex(limit, length); 2793 int32_t desti = 0; 2794 int32_t srci; 2795 int32_t copyLimit; 2796 2797 CharacterIterator *ci = (CharacterIterator *)ut->context; 2798 ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed. 
2799 srci = ci->getIndex(); 2800 copyLimit = srci; 2801 while (srci<limit32) { 2802 UChar32 c = ci->next32PostInc(); 2803 int32_t len = U16_LENGTH(c); 2804 U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */ 2805 if (desti+len <= destCapacity) { 2806 U16_APPEND_UNSAFE(dest, desti, c); 2807 copyLimit = srci+len; 2808 } else { 2809 desti += len; 2810 *status = U_BUFFER_OVERFLOW_ERROR; 2811 } 2812 srci += len; 2813 } 2814 2815 charIterTextAccess(ut, copyLimit, true); 2816 2817 u_terminateUChars(dest, destCapacity, desti, status); 2818 return desti; 2819 } 2820 2821 static const struct UTextFuncs charIterFuncs = 2822 { 2823 sizeof(UTextFuncs), 2824 0, 0, 0, // Reserved alignment padding 2825 charIterTextClone, 2826 charIterTextLength, 2827 charIterTextAccess, 2828 charIterTextExtract, 2829 nullptr, // Replace 2830 nullptr, // Copy 2831 nullptr, // MapOffsetToNative, 2832 nullptr, // MapIndexToUTF16, 2833 charIterTextClose, 2834 nullptr, // spare 1 2835 nullptr, // spare 2 2836 nullptr // spare 3 2837 }; 2838 U_CDECL_END 2839 2840 2841 U_CAPI UText * U_EXPORT2 2842 utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) { 2843 if (U_FAILURE(*status)) { 2844 return nullptr; 2845 } 2846 2847 if (ci->startIndex() > 0) { 2848 // No support for CharacterIterators that do not start indexing from zero. 2849 *status = U_UNSUPPORTED_ERROR; 2850 return nullptr; 2851 } 2852 2853 // Extra space in UText for 2 buffers of CIBufSize UChars each. 
2854 int32_t extraSpace = 2 * CIBufSize * sizeof(char16_t); 2855 ut = utext_setup(ut, extraSpace, status); 2856 if (U_SUCCESS(*status)) { 2857 ut->pFuncs = &charIterFuncs; 2858 ut->context = ci; 2859 ut->providerProperties = 0; 2860 ut->a = ci->endIndex(); // Length of text 2861 ut->p = ut->pExtra; // First buffer 2862 ut->b = -1; // Native index of first buffer contents 2863 ut->q = (char16_t*)ut->pExtra+CIBufSize; // Second buffer 2864 ut->c = -1; // Native index of second buffer contents 2865 2866 // Initialize current chunk contents to be empty. 2867 // First access will fault something in. 2868 // Note: The initial nativeStart and chunkOffset must sum to zero 2869 // so that getNativeIndex() will correctly compute to zero 2870 // if no call to Access() has ever been made. They can't be both 2871 // zero without Access() thinking that the chunk is valid. 2872 ut->chunkContents = (char16_t *)ut->p; 2873 ut->chunkNativeStart = -1; 2874 ut->chunkOffset = 1; 2875 ut->chunkNativeLimit = 0; 2876 ut->chunkLength = 0; 2877 ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing 2878 } 2879 return ut; 2880 }