unistr.cpp (62125B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1999-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ****************************************************************************** 8 * 9 * File unistr.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 09/25/98 stephen Creation. 15 * 04/20/99 stephen Overhauled per 4/16 code review. 16 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 17 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from 18 * Replaceable. 19 * 06/25/01 grhoten Removed the dependency on iostream 20 ****************************************************************************** 21 */ 22 23 #include <string_view> 24 25 #include "unicode/utypes.h" 26 #include "unicode/appendable.h" 27 #include "unicode/putil.h" 28 #include "cstring.h" 29 #include "cmemory.h" 30 #include "unicode/ustring.h" 31 #include "unicode/unistr.h" 32 #include "unicode/utf.h" 33 #include "unicode/utf16.h" 34 #include "uelement.h" 35 #include "ustr_imp.h" 36 #include "umutex.h" 37 #include "uassert.h" 38 39 #if 0 40 41 #include <iostream> 42 using namespace std; 43 44 //DEBUGGING 45 void 46 print(const UnicodeString& s, 47 const char *name) 48 { 49 char16_t c; 50 cout << name << ":|"; 51 for(int i = 0; i < s.length(); ++i) { 52 c = s[i]; 53 if(c>= 0x007E || c < 0x0020) 54 cout << "[0x" << hex << s[i] << "]"; 55 else 56 cout << (char) s[i]; 57 } 58 cout << '|' << endl; 59 } 60 61 void 62 print(const char16_t *s, 63 int32_t len, 64 const char *name) 65 { 66 char16_t c; 67 cout << name << ":|"; 68 for(int i = 0; i < len; ++i) { 69 c = s[i]; 70 if(c>= 0x007E || c < 0x0020) 71 cout << "[0x" << hex << s[i] << "]"; 72 else 73 cout << (char) s[i]; 74 } 75 cout << '|' << endl; 76 } 77 // END DEBUGGING 78 #endif 79 80 // Local 
function definitions for now 81 82 // need to copy areas that may overlap 83 static 84 inline void 85 us_arrayCopy(const char16_t *src, int32_t srcStart, 86 char16_t *dst, int32_t dstStart, int32_t count) 87 { 88 if(count>0) { 89 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src)); 90 } 91 } 92 93 // u_unescapeAt() callback to get a char16_t from a UnicodeString 94 U_CDECL_BEGIN 95 static char16_t U_CALLCONV 96 UnicodeString_charAt(int32_t offset, void *context) { 97 return ((icu::UnicodeString*) context)->charAt(offset); 98 } 99 U_CDECL_END 100 101 U_NAMESPACE_BEGIN 102 103 /* The Replaceable virtual destructor can't be defined in the header 104 due to how AIX works with multiple definitions of virtual functions. 105 */ 106 Replaceable::~Replaceable() {} 107 108 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 109 110 UnicodeString U_EXPORT2 111 operator+ (const UnicodeString &s1, const UnicodeString &s2) { 112 int32_t sumLengths; 113 if (uprv_add32_overflow(s1.length(), s2.length(), &sumLengths)) { 114 UnicodeString bogus; 115 bogus.setToBogus(); 116 return bogus; 117 } 118 if (sumLengths != INT32_MAX) { 119 ++sumLengths; // space for a terminating NUL if we need one 120 } 121 return UnicodeString(sumLengths, static_cast<UChar32>(0), 0).append(s1).append(s2); 122 } 123 124 U_COMMON_API UnicodeString U_EXPORT2 125 unistr_internalConcat(const UnicodeString &s1, std::u16string_view s2) { 126 int32_t sumLengths; 127 if (s2.length() > INT32_MAX || 128 uprv_add32_overflow(s1.length(), static_cast<int32_t>(s2.length()), &sumLengths)) { 129 UnicodeString bogus; 130 bogus.setToBogus(); 131 return bogus; 132 } 133 if (sumLengths != INT32_MAX) { 134 ++sumLengths; // space for a terminating NUL if we need one 135 } 136 return UnicodeString(sumLengths, static_cast<UChar32>(0), 0).append(s1).append(s2); 137 } 138 139 140 //======================================== 141 // Reference Counting functions, put at top of file so that optimizing compilers 142 // have 
// a chance to automatically inline.
//========================================

// All three reference-count accessors address the int32_t slot that sits
// immediately before fArray (hence the "- 1" after the pointer cast).

// Atomically increments the reference counter stored in front of fArray.
void
UnicodeString::addRef() {
  umtx_atomic_inc(reinterpret_cast<u_atomic_int32_t*>(fUnion.fFields.fArray) - 1);
}

// Atomically decrements the reference counter and returns the result of
// umtx_atomic_dec().
int32_t
UnicodeString::removeRef() {
  return umtx_atomic_dec(reinterpret_cast<u_atomic_int32_t*>(fUnion.fFields.fArray) - 1);
}

// Reads the reference counter via umtx_loadAcquire().
int32_t
UnicodeString::refCount() const {
  return umtx_loadAcquire(*(reinterpret_cast<u_atomic_int32_t*>(fUnion.fFields.fArray) - 1));
}

// Drops this object's reference to a ref-counted buffer.
// The memory block (which starts at the counter, one int32_t before fArray)
// is freed only when kRefCounted is set and removeRef() returns 0.
void
UnicodeString::releaseArray() {
  if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
    uprv_free(reinterpret_cast<int32_t*>(fUnion.fFields.fArray) - 1);
  }
}



//========================================
// Constructors
//========================================

// The default constructor is inline in unistr.h.

// Constructs a string with at least the given capacity that contains
// count copies of code point c.
// If count<=0 or c is not a code point (above 0x10ffff, or negative), only
// the allocation is performed and no contents are written.
UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
  fUnion.fFields.fLengthAndFlags = 0;
  if (count <= 0 || static_cast<uint32_t>(c) > 0x10ffff) {
    // just allocate and do not do anything else
    allocate(capacity);
  } else if(c <= 0xffff) {
    // BMP code point: one code unit per copy
    int32_t length = count;
    if(capacity < length) {
      capacity = length;
    }
    if(allocate(capacity)) {
      char16_t *array = getArrayStart();
      char16_t unit = static_cast<char16_t>(c);
      for(int32_t i = 0; i < length; ++i) {
        array[i] = unit;
      }
      setLength(length);
    }
  } else { // supplementary code point, write surrogate pairs
    if(count > (INT32_MAX / 2)) {
      // We would get more than 2G UChars.
      allocate(capacity);
      return;
    }
    // two code units (lead + trail surrogate) per copy
    int32_t length = count * 2;
    if(capacity < length) {
      capacity = length;
    }
    if(allocate(capacity)) {
      char16_t *array = getArrayStart();
      char16_t lead = U16_LEAD(c);
      char16_t trail = U16_TRAIL(c);
      for(int32_t i = 0; i < length; i += 2) {
        array[i] = lead;
        array[i + 1] = trail;
      }
      setLength(length);
    }
  }
}

// Constructs a one-unit string in the internal stack buffer.
UnicodeString::UnicodeString(char16_t ch) {
  fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
  fUnion.fStackFields.fBuffer[0] = ch;
}

// Constructs a string holding the single code point ch in the stack buffer.
// If U16_APPEND reports an error the string stays empty.
UnicodeString::UnicodeString(UChar32 ch) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  int32_t i = 0;
  UBool isError = false;
  U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
  // We test isError so that the compiler does not complain that we don't.
  // If isError then i==0 which is what we want anyway.
  if(!isError) {
    setShortLength(i);
  }
}

// Constructs a string by appending textLength units of text to an empty
// short string; argument handling is delegated to doAppend().
UnicodeString::UnicodeString(const char16_t *text,
                             int32_t textLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  doAppend(text, 0, textLength);
}

// Read-only aliasing constructor: the string refers to the caller's buffer
// instead of copying it.
//  - text==nullptr                                   -> empty string, no alias
//  - textLength < -1, or -1 without isTerminated, or
//    isTerminated but text[textLength] is not NUL    -> bogus string
//  - textLength == -1 with isTerminated              -> length from u_strlen()
UnicodeString::UnicodeString(UBool isTerminated,
                             ConstChar16Ptr textPtr,
                             int32_t textLength) {
  fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
  const char16_t *text = textPtr;
  if(text == nullptr) {
    // treat as an empty string, do not alias
    setToEmpty();
  } else if(textLength < -1 ||
            (textLength == -1 && !isTerminated) ||
            (textLength >= 0 && isTerminated && text[textLength] != 0)
  ) {
    setToBogus();
  } else {
    if(textLength == -1) {
      // text is terminated, or else it would have failed the above test
      textLength = u_strlen(text);
    }
    // when terminated, the NUL counts towards the recorded capacity
    setArray(const_cast<char16_t *>(text), textLength,
             isTerminated ? textLength + 1 : textLength);
  }
}

// Writable aliasing constructor: the string uses the caller's buffer.
//  - buff==nullptr                                   -> empty string, no alias
//  - buffLength < -1, buffCapacity < 0, or
//    buffLength > buffCapacity                       -> bogus string
//  - buffLength == -1                                -> length is the index of the
//    first NUL, searched no further than buffCapacity
UnicodeString::UnicodeString(char16_t *buff,
                             int32_t buffLength,
                             int32_t buffCapacity) {
  fUnion.fFields.fLengthAndFlags = kWritableAlias;
  if(buff == nullptr) {
    // treat as an empty string, do not alias
    setToEmpty();
  } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    setToBogus();
  } else {
    if(buffLength == -1) {
      // fLength = u_strlen(buff); but do not look beyond buffCapacity
      const char16_t *p = buff, *limit = buff + buffCapacity;
      while(p != limit && *p != 0) {
        ++p;
      }
      buffLength = static_cast<int32_t>(p - buff);
    }
    setArray(buff, buffLength, buffCapacity);
  }
}

// Constructs a string from invariant characters via u_charsToUChars().
// A negative length means src is NUL-terminated; src==nullptr gives an empty
// string; the string becomes bogus if the buffer cannot be prepared.
UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  if(src==nullptr) {
    // treat as an empty string
  } else {
    if(length<0) {
      length = static_cast<int32_t>(uprv_strlen(src));
    }
    if(cloneArrayIfNeeded(length, length, false)) {
      u_charsToUChars(src, getArrayStart(), length);
      setLength(length);
    } else {
      setToBogus();
    }
  }
}

// Returns a read-only alias of text (not NUL-terminated), or a bogus string
// if the view is longer than INT32_MAX.
UnicodeString UnicodeString::readOnlyAliasFromU16StringView(std::u16string_view text) {
  UnicodeString result;
  if (text.length() <= INT32_MAX) {
    result.setTo(false, text.data(), static_cast<int32_t>(text.length()));
  } else {
    result.setToBogus();
  }
  return result;
}

// Returns a read-only alias of another UnicodeString's buffer;
// a bogus input gives a bogus result.
UnicodeString UnicodeString::readOnlyAliasFromUnicodeString(const UnicodeString &text) {
  UnicodeString result;
  if (text.isBogus()) {
    result.setToBogus();
  } else {
    result.setTo(false, text.getBuffer(), text.length());
  }
  return result;
}

#if U_CHARSET_IS_UTF8

// Constructs a string from NUL-terminated UTF-8 data;
// nullptr gives an empty string.
UnicodeString::UnicodeString(const char *codepageData) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  if (codepageData != nullptr) {
    setToUTF8(codepageData);
  }
}

UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) { 331 fUnion.fFields.fLengthAndFlags = kShortString; 332 // if there's nothing to convert, do nothing 333 if (codepageData == nullptr || dataLength == 0 || dataLength < -1) { 334 return; 335 } 336 if(dataLength == -1) { 337 dataLength = static_cast<int32_t>(uprv_strlen(codepageData)); 338 } 339 setToUTF8(StringPiece(codepageData, dataLength)); 340 } 341 342 // else see unistr_cnv.cpp 343 #endif 344 345 UnicodeString::UnicodeString(const UnicodeString& that) { 346 fUnion.fFields.fLengthAndFlags = kShortString; 347 copyFrom(that); 348 } 349 350 UnicodeString::UnicodeString(UnicodeString &&src) noexcept { 351 copyFieldsFrom(src, true); 352 } 353 354 UnicodeString::UnicodeString(const UnicodeString& that, 355 int32_t srcStart) { 356 fUnion.fFields.fLengthAndFlags = kShortString; 357 setTo(that, srcStart); 358 } 359 360 UnicodeString::UnicodeString(const UnicodeString& that, 361 int32_t srcStart, 362 int32_t srcLength) { 363 fUnion.fFields.fLengthAndFlags = kShortString; 364 setTo(that, srcStart, srcLength); 365 } 366 367 // Replaceable base class clone() default implementation, does not clone 368 Replaceable * 369 Replaceable::clone() const { 370 return nullptr; 371 } 372 373 // UnicodeString overrides clone() with a real implementation 374 UnicodeString * 375 UnicodeString::clone() const { 376 LocalPointer<UnicodeString> clonedString(new UnicodeString(*this)); 377 return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr; 378 } 379 380 //======================================== 381 // array allocation 382 //======================================== 383 384 namespace { 385 386 const int32_t kGrowSize = 128; 387 388 // The number of bytes for one int32_t reference counter and capacity UChars 389 // must fit into a 32-bit size_t (at least when on a 32-bit platform). 
390 // We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(), 391 // and round up to a multiple of 16 bytes. 392 // This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5. 393 // (With more complicated checks we could go up to 0x7ffffffd without rounding up, 394 // but that does not seem worth it.) 395 const int32_t kMaxCapacity = 0x7ffffff5; 396 397 int32_t getGrowCapacity(int32_t newLength) { 398 int32_t growSize = (newLength >> 2) + kGrowSize; 399 if(growSize <= (kMaxCapacity - newLength)) { 400 return newLength + growSize; 401 } else { 402 return kMaxCapacity; 403 } 404 } 405 406 } // namespace 407 408 UBool 409 UnicodeString::allocate(int32_t capacity) { 410 if(capacity <= US_STACKBUF_SIZE) { 411 fUnion.fFields.fLengthAndFlags = kShortString; 412 return true; 413 } 414 if(capacity <= kMaxCapacity) { 415 ++capacity; // for the NUL 416 // Switch to size_t which is unsigned so that we can allocate up to 4GB. 417 // Reference counter + UChars. 418 size_t numBytes = sizeof(int32_t) + static_cast<size_t>(capacity) * U_SIZEOF_UCHAR; 419 // Round up to a multiple of 16. 
420 numBytes = (numBytes + 15) & ~15; 421 int32_t* array = static_cast<int32_t*>(uprv_malloc(numBytes)); 422 if(array != nullptr) { 423 // set initial refCount and point behind the refCount 424 *array++ = 1; 425 numBytes -= sizeof(int32_t); 426 427 // have fArray point to the first char16_t 428 fUnion.fFields.fArray = reinterpret_cast<char16_t*>(array); 429 fUnion.fFields.fCapacity = static_cast<int32_t>(numBytes / U_SIZEOF_UCHAR); 430 fUnion.fFields.fLengthAndFlags = kLongString; 431 return true; 432 } 433 } 434 fUnion.fFields.fLengthAndFlags = kIsBogus; 435 fUnion.fFields.fArray = nullptr; 436 fUnion.fFields.fCapacity = 0; 437 return false; 438 } 439 440 //======================================== 441 // Destructor 442 //======================================== 443 444 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 445 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1 446 static u_atomic_int32_t beyondCount(0); 447 448 U_CAPI void unistr_printLengths() { 449 int32_t i; 450 for(i = 0; i <= 59; ++i) { 451 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]); 452 } 453 int32_t beyond = beyondCount; 454 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) { 455 beyond += finalLengthCounts[i]; 456 } 457 printf(">59, %9d\n", beyond); 458 } 459 #endif 460 461 UnicodeString::~UnicodeString() 462 { 463 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 464 // Count lengths of strings at the end of their lifetime. 465 // Useful for discussion of a desirable stack buffer size. 466 // Count the contents length, not the optional NUL terminator nor further capacity. 467 // Ignore open-buffer strings and strings which alias external storage. 
468 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) { 469 if(hasShortLength()) { 470 umtx_atomic_inc(finalLengthCounts + getShortLength()); 471 } else { 472 umtx_atomic_inc(&beyondCount); 473 } 474 } 475 #endif 476 477 releaseArray(); 478 } 479 480 //======================================== 481 // Factory methods 482 //======================================== 483 484 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) { 485 UnicodeString result; 486 result.setToUTF8(utf8); 487 return result; 488 } 489 490 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 491 UnicodeString result; 492 int32_t capacity; 493 // Most UTF-32 strings will be BMP-only and result in a same-length 494 // UTF-16 string. We overestimate the capacity just slightly, 495 // just in case there are a few supplementary characters. 496 if(length <= US_STACKBUF_SIZE) { 497 capacity = US_STACKBUF_SIZE; 498 } else { 499 capacity = length + (length >> 4) + 4; 500 } 501 do { 502 char16_t *utf16 = result.getBuffer(capacity); 503 int32_t length16; 504 UErrorCode errorCode = U_ZERO_ERROR; 505 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 506 utf32, length, 507 0xfffd, // Substitution character. 508 nullptr, // Don't care about number of substitutions. 509 &errorCode); 510 result.releaseBuffer(length16); 511 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 512 capacity = length16 + 1; // +1 for the terminating NUL. 
513 continue; 514 } else if(U_FAILURE(errorCode)) { 515 result.setToBogus(); 516 } 517 break; 518 } while(true); 519 return result; 520 } 521 522 //======================================== 523 // Assignment 524 //======================================== 525 526 UnicodeString & 527 UnicodeString::operator=(const UnicodeString &src) { 528 return copyFrom(src); 529 } 530 531 UnicodeString & 532 UnicodeString::fastCopyFrom(const UnicodeString &src) { 533 return copyFrom(src, true); 534 } 535 536 UnicodeString & 537 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 538 // if assigning to ourselves, do nothing 539 if(this == &src) { 540 return *this; 541 } 542 543 // is the right side bogus? 544 if(src.isBogus()) { 545 setToBogus(); 546 return *this; 547 } 548 549 // delete the current contents 550 releaseArray(); 551 552 if(src.isEmpty()) { 553 // empty string - use the stack buffer 554 setToEmpty(); 555 return *this; 556 } 557 558 // fLength>0 and not an "open" src.getBuffer(minCapacity) 559 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 560 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) { 561 case kShortString: 562 // short string using the stack buffer, do the same 563 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 564 getShortLength() * U_SIZEOF_UCHAR); 565 break; 566 case kLongString: 567 // src uses a refCounted string buffer, use that buffer with refCount 568 // src is const, use a cast - we don't actually change it 569 const_cast<UnicodeString &>(src).addRef(); 570 // copy all fields, share the reference-counted buffer 571 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 572 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 573 if(!hasShortLength()) { 574 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 575 } 576 break; 577 case kReadonlyAlias: 578 if(fastCopy) { 579 // src is a readonly alias, do the same 580 // -> maintain the readonly alias as such 581 
fUnion.fFields.fArray = src.fUnion.fFields.fArray; 582 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 583 if(!hasShortLength()) { 584 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 585 } 586 break; 587 } 588 // else if(!fastCopy) fall through to case kWritableAlias 589 // -> allocate a new buffer and copy the contents 590 U_FALLTHROUGH; 591 case kWritableAlias: { 592 // src is a writable alias; we make a copy of that instead 593 int32_t srcLength = src.length(); 594 if(allocate(srcLength)) { 595 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength); 596 setLength(srcLength); 597 break; 598 } 599 // if there is not enough memory, then fall through to setting to bogus 600 U_FALLTHROUGH; 601 } 602 default: 603 // if src is bogus, set ourselves to bogus 604 // do not call setToBogus() here because fArray and flags are not consistent here 605 fUnion.fFields.fLengthAndFlags = kIsBogus; 606 fUnion.fFields.fArray = nullptr; 607 fUnion.fFields.fCapacity = 0; 608 break; 609 } 610 611 return *this; 612 } 613 614 UnicodeString &UnicodeString::operator=(UnicodeString &&src) noexcept { 615 // No explicit check for self move assignment, consistent with standard library. 616 // Self move assignment causes no crash nor leak but might make the object bogus. 617 releaseArray(); 618 copyFieldsFrom(src, true); 619 return *this; 620 } 621 622 // Same as move assignment except without memory management. 623 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) noexcept { 624 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 625 if(lengthAndFlags & kUsingStackBuffer) { 626 // Short string using the stack buffer, copy the contents. 627 // Check for self assignment to prevent "overlap in memcpy" warnings, 628 // although it should be harmless to copy a buffer to itself exactly. 
629 if(this != &src) { 630 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 631 getShortLength() * U_SIZEOF_UCHAR); 632 } 633 } else { 634 // In all other cases, copy all fields. 635 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 636 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 637 if(!hasShortLength()) { 638 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 639 } 640 if(setSrcToBogus) { 641 // Set src to bogus without releasing any memory. 642 src.fUnion.fFields.fLengthAndFlags = kIsBogus; 643 src.fUnion.fFields.fArray = nullptr; 644 src.fUnion.fFields.fCapacity = 0; 645 } 646 } 647 } 648 649 void UnicodeString::swap(UnicodeString &other) noexcept { 650 UnicodeString temp; // Empty short string: Known not to need releaseArray(). 651 // Copy fields without resetting source values in between. 652 temp.copyFieldsFrom(*this, false); 653 this->copyFieldsFrom(other, false); 654 other.copyFieldsFrom(temp, false); 655 // Set temp to an empty string so that other's memory is not released twice. 
656 temp.fUnion.fFields.fLengthAndFlags = kShortString; 657 } 658 659 //======================================== 660 // Miscellaneous operations 661 //======================================== 662 663 UnicodeString UnicodeString::unescape() const { 664 UnicodeString result(length(), static_cast<UChar32>(0), static_cast<int32_t>(0)); // construct with capacity 665 if (result.isBogus()) { 666 return result; 667 } 668 const char16_t *array = getBuffer(); 669 int32_t len = length(); 670 int32_t prev = 0; 671 for (int32_t i=0;;) { 672 if (i == len) { 673 result.append(array, prev, len - prev); 674 break; 675 } 676 if (array[i++] == 0x5C /*'\\'*/) { 677 result.append(array, prev, (i - 1) - prev); 678 UChar32 c = unescapeAt(i); // advances i 679 if (c < 0) { 680 result.remove(); // return empty string 681 break; // invalid escape sequence 682 } 683 result.append(c); 684 prev = i; 685 } 686 } 687 return result; 688 } 689 690 UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 691 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 692 } 693 694 //======================================== 695 // Read-only implementation 696 //======================================== 697 UBool 698 UnicodeString::doEquals(const char16_t *text, int32_t len) const { 699 // Requires: this not bogus and have same lengths. 700 // Byte-wise comparison works for equality regardless of endianness. 701 return uprv_memcmp(getArrayStart(), text, len * U_SIZEOF_UCHAR) == 0; 702 } 703 704 UBool 705 UnicodeString::doEqualsSubstring( int32_t start, 706 int32_t length, 707 const char16_t *srcChars, 708 int32_t srcStart, 709 int32_t srcLength) const 710 { 711 // compare illegal string values 712 if(isBogus()) { 713 return false; 714 } 715 716 // pin indices to legal values 717 pinIndices(start, length); 718 719 if(srcChars == nullptr) { 720 // treat const char16_t *srcChars==nullptr as an empty string 721 return length == 0 ? 
true : false; 722 } 723 724 // get the correct pointer 725 const char16_t *chars = getArrayStart(); 726 727 chars += start; 728 srcChars += srcStart; 729 730 // get the srcLength if necessary 731 if(srcLength < 0) { 732 srcLength = u_strlen(srcChars + srcStart); 733 } 734 735 if (length != srcLength) { 736 return false; 737 } 738 739 if(length == 0 || chars == srcChars) { 740 return true; 741 } 742 743 return u_memcmp(chars, srcChars, srcLength) == 0; 744 } 745 746 int8_t 747 UnicodeString::doCompare( int32_t start, 748 int32_t length, 749 const char16_t *srcChars, 750 int32_t srcStart, 751 int32_t srcLength) const 752 { 753 // compare illegal string values 754 if(isBogus()) { 755 return -1; 756 } 757 758 // pin indices to legal values 759 pinIndices(start, length); 760 761 if(srcChars == nullptr) { 762 // treat const char16_t *srcChars==nullptr as an empty string 763 return length == 0 ? 0 : 1; 764 } 765 766 // get the correct pointer 767 const char16_t *chars = getArrayStart(); 768 769 chars += start; 770 srcChars += srcStart; 771 772 int32_t minLength; 773 int8_t lengthResult; 774 775 // get the srcLength if necessary 776 if(srcLength < 0) { 777 srcLength = u_strlen(srcChars + srcStart); 778 } 779 780 // are we comparing different lengths? 
781 if(length != srcLength) { 782 if(length < srcLength) { 783 minLength = length; 784 lengthResult = -1; 785 } else { 786 minLength = srcLength; 787 lengthResult = 1; 788 } 789 } else { 790 minLength = length; 791 lengthResult = 0; 792 } 793 794 /* 795 * note that uprv_memcmp() returns an int but we return an int8_t; 796 * we need to take care not to truncate the result - 797 * one way to do this is to right-shift the value to 798 * move the sign bit into the lower 8 bits and making sure that this 799 * does not become 0 itself 800 */ 801 802 if(minLength > 0 && chars != srcChars) { 803 int32_t result; 804 805 # if U_IS_BIG_ENDIAN 806 // big-endian: byte comparison works 807 result = uprv_memcmp(chars, srcChars, minLength * sizeof(char16_t)); 808 if(result != 0) { 809 return (int8_t)(result >> 15 | 1); 810 } 811 # else 812 // little-endian: compare char16_t units 813 do { 814 result = static_cast<int32_t>(*(chars++)) - static_cast<int32_t>(*(srcChars++)); 815 if(result != 0) { 816 return static_cast<int8_t>(result >> 15 | 1); 817 } 818 } while(--minLength > 0); 819 # endif 820 } 821 return lengthResult; 822 } 823 824 /* String compare in code point order - doCompare() compares in code unit order. 
*/ 825 int8_t 826 UnicodeString::doCompareCodePointOrder(int32_t start, 827 int32_t length, 828 const char16_t *srcChars, 829 int32_t srcStart, 830 int32_t srcLength) const 831 { 832 // compare illegal string values 833 // treat const char16_t *srcChars==nullptr as an empty string 834 if(isBogus()) { 835 return -1; 836 } 837 838 // pin indices to legal values 839 pinIndices(start, length); 840 841 if(srcChars == nullptr) { 842 srcStart = srcLength = 0; 843 } 844 845 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=nullptr)?(srcChars + srcStart):nullptr, srcLength, false, true); 846 /* translate the 32-bit result into an 8-bit one */ 847 if(diff!=0) { 848 return static_cast<int8_t>(diff >> 15 | 1); 849 } else { 850 return 0; 851 } 852 } 853 854 int32_t 855 UnicodeString::getLength() const { 856 return length(); 857 } 858 859 char16_t 860 UnicodeString::getCharAt(int32_t offset) const { 861 return charAt(offset); 862 } 863 864 UChar32 865 UnicodeString::getChar32At(int32_t offset) const { 866 return char32At(offset); 867 } 868 869 UChar32 870 UnicodeString::char32At(int32_t offset) const 871 { 872 int32_t len = length(); 873 if (static_cast<uint32_t>(offset) < static_cast<uint32_t>(len)) { 874 const char16_t *array = getArrayStart(); 875 UChar32 c; 876 U16_GET(array, 0, offset, len, c); 877 return c; 878 } else { 879 return kInvalidUChar; 880 } 881 } 882 883 int32_t 884 UnicodeString::getChar32Start(int32_t offset) const { 885 if (static_cast<uint32_t>(offset) < static_cast<uint32_t>(length())) { 886 const char16_t *array = getArrayStart(); 887 U16_SET_CP_START(array, 0, offset); 888 return offset; 889 } else { 890 return 0; 891 } 892 } 893 894 int32_t 895 UnicodeString::getChar32Limit(int32_t offset) const { 896 int32_t len = length(); 897 if (static_cast<uint32_t>(offset) < static_cast<uint32_t>(len)) { 898 const char16_t *array = getArrayStart(); 899 U16_SET_CP_LIMIT(array, 0, offset, len); 900 return offset; 901 } else { 902 return len; 
903 } 904 } 905 906 int32_t 907 UnicodeString::countChar32(int32_t start, int32_t length) const { 908 pinIndices(start, length); 909 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for nullptr 910 return u_countChar32(getArrayStart()+start, length); 911 } 912 913 UBool 914 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 915 pinIndices(start, length); 916 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for nullptr 917 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 918 } 919 920 int32_t 921 UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 922 // pin index 923 int32_t len = length(); 924 if(index<0) { 925 index=0; 926 } else if(index>len) { 927 index=len; 928 } 929 930 const char16_t *array = getArrayStart(); 931 if(delta>0) { 932 U16_FWD_N(array, index, len, delta); 933 } else { 934 U16_BACK_N(array, 0, index, -delta); 935 } 936 937 return index; 938 } 939 940 void 941 UnicodeString::doExtract(int32_t start, 942 int32_t length, 943 char16_t *dst, 944 int32_t dstStart) const 945 { 946 // pin indices to legal values 947 pinIndices(start, length); 948 949 // do not copy anything if we alias dst itself 950 const char16_t *array = getArrayStart(); 951 if(array + start != dst + dstStart) { 952 us_arrayCopy(array, start, dst, dstStart, length); 953 } 954 } 955 956 int32_t 957 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity, 958 UErrorCode &errorCode) const { 959 int32_t len = length(); 960 if(U_SUCCESS(errorCode)) { 961 if (isBogus() || destCapacity < 0 || (destCapacity > 0 && dest == nullptr)) { 962 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 963 } else { 964 const char16_t *array = getArrayStart(); 965 if(len>0 && len<=destCapacity && array!=dest) { 966 u_memcpy(dest, array, len); 967 } 968 return u_terminateUChars(dest, destCapacity, len, &errorCode); 969 } 970 } 971 972 return len; 973 } 974 975 int32_t 976 
UnicodeString::extract(int32_t start, 977 int32_t length, 978 char *target, 979 int32_t targetCapacity, 980 enum EInvariant) const 981 { 982 // if the arguments are illegal, then do nothing 983 if(targetCapacity < 0 || (targetCapacity > 0 && target == nullptr)) { 984 return 0; 985 } 986 987 // pin the indices to legal values 988 pinIndices(start, length); 989 990 if(length <= targetCapacity) { 991 u_UCharsToChars(getArrayStart() + start, target, length); 992 } 993 UErrorCode status = U_ZERO_ERROR; 994 return u_terminateChars(target, targetCapacity, length, &status); 995 } 996 997 UnicodeString 998 UnicodeString::tempSubString(int32_t start, int32_t len) const { 999 pinIndices(start, len); 1000 const char16_t *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 1001 if(array==nullptr) { 1002 array=fUnion.fStackFields.fBuffer; // anything not nullptr because that would make an empty string 1003 len=-2; // bogus result string 1004 } 1005 return UnicodeString(false, array + start, len); 1006 } 1007 1008 int32_t 1009 UnicodeString::toUTF8(int32_t start, int32_t len, 1010 char *target, int32_t capacity) const { 1011 pinIndices(start, len); 1012 int32_t length8; 1013 UErrorCode errorCode = U_ZERO_ERROR; 1014 u_strToUTF8WithSub(target, capacity, &length8, 1015 getBuffer() + start, len, 1016 0xFFFD, // Standard substitution character. 1017 nullptr, // Don't care about number of substitutions. 1018 &errorCode); 1019 return length8; 1020 } 1021 1022 #if U_CHARSET_IS_UTF8 1023 1024 int32_t 1025 UnicodeString::extract(int32_t start, int32_t len, 1026 char *target, uint32_t dstSize) const { 1027 // if the arguments are illegal, then do nothing 1028 if (/*dstSize < 0 || */(dstSize > 0 && target == nullptr)) { 1029 return 0; 1030 } 1031 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? 
static_cast<int32_t>(dstSize) : 0x7fffffff); 1032 } 1033 1034 // else see unistr_cnv.cpp 1035 #endif 1036 1037 void 1038 UnicodeString::extractBetween(int32_t start, 1039 int32_t limit, 1040 UnicodeString& target) const { 1041 pinIndex(start); 1042 pinIndex(limit); 1043 doExtract(start, limit - start, target); 1044 } 1045 1046 // When converting from UTF-16 to UTF-8, the result will have at most 3 times 1047 // as many bytes as the source has UChars. 1048 // The "worst cases" are writing systems like Indic, Thai and CJK with 1049 // 3:1 bytes:UChars. 1050 void 1051 UnicodeString::toUTF8(ByteSink &sink) const { 1052 int32_t length16 = length(); 1053 if(length16 != 0) { 1054 char stackBuffer[1024]; 1055 int32_t capacity = static_cast<int32_t>(sizeof(stackBuffer)); 1056 UBool utf8IsOwned = false; 1057 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 1058 3*length16, 1059 stackBuffer, capacity, 1060 &capacity); 1061 int32_t length8 = 0; 1062 UErrorCode errorCode = U_ZERO_ERROR; 1063 u_strToUTF8WithSub(utf8, capacity, &length8, 1064 getBuffer(), length16, 1065 0xFFFD, // Standard substitution character. 1066 nullptr, // Don't care about number of substitutions. 1067 &errorCode); 1068 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 1069 utf8 = static_cast<char*>(uprv_malloc(length8)); 1070 if(utf8 != nullptr) { 1071 utf8IsOwned = true; 1072 errorCode = U_ZERO_ERROR; 1073 u_strToUTF8WithSub(utf8, length8, &length8, 1074 getBuffer(), length16, 1075 0xFFFD, // Standard substitution character. 1076 nullptr, // Don't care about number of substitutions. 
1077 &errorCode); 1078 } else { 1079 errorCode = U_MEMORY_ALLOCATION_ERROR; 1080 } 1081 } 1082 if(U_SUCCESS(errorCode)) { 1083 sink.Append(utf8, length8); 1084 sink.Flush(); 1085 } 1086 if(utf8IsOwned) { 1087 uprv_free(utf8); 1088 } 1089 } 1090 } 1091 1092 int32_t 1093 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 1094 int32_t length32=0; 1095 if(U_SUCCESS(errorCode)) { 1096 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 1097 u_strToUTF32WithSub(utf32, capacity, &length32, 1098 getBuffer(), length(), 1099 0xfffd, // Substitution character. 1100 nullptr, // Don't care about number of substitutions. 1101 &errorCode); 1102 } 1103 return length32; 1104 } 1105 1106 int32_t 1107 UnicodeString::indexOf(const char16_t *srcChars, 1108 int32_t srcStart, 1109 int32_t srcLength, 1110 int32_t start, 1111 int32_t length) const 1112 { 1113 if (isBogus() || srcChars == nullptr || srcStart < 0 || srcLength == 0) { 1114 return -1; 1115 } 1116 1117 // UnicodeString does not find empty substrings 1118 if(srcLength < 0 && srcChars[srcStart] == 0) { 1119 return -1; 1120 } 1121 1122 // get the indices within bounds 1123 pinIndices(start, length); 1124 1125 // find the first occurrence of the substring 1126 const char16_t *array = getArrayStart(); 1127 const char16_t *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 1128 if(match == nullptr) { 1129 return -1; 1130 } else { 1131 return static_cast<int32_t>(match - array); 1132 } 1133 } 1134 1135 int32_t 1136 UnicodeString::doIndexOf(char16_t c, 1137 int32_t start, 1138 int32_t length) const 1139 { 1140 // pin indices 1141 pinIndices(start, length); 1142 1143 // find the first occurrence of c 1144 const char16_t *array = getArrayStart(); 1145 const char16_t *match = u_memchr(array + start, c, length); 1146 if(match == nullptr) { 1147 return -1; 1148 } else { 1149 return static_cast<int32_t>(match - array); 1150 } 1151 } 1152 1153 int32_t 1154 
UnicodeString::doIndexOf(UChar32 c, 1155 int32_t start, 1156 int32_t length) const { 1157 // pin indices 1158 pinIndices(start, length); 1159 1160 // find the first occurrence of c 1161 const char16_t *array = getArrayStart(); 1162 const char16_t *match = u_memchr32(array + start, c, length); 1163 if(match == nullptr) { 1164 return -1; 1165 } else { 1166 return static_cast<int32_t>(match - array); 1167 } 1168 } 1169 1170 int32_t 1171 UnicodeString::lastIndexOf(const char16_t *srcChars, 1172 int32_t srcStart, 1173 int32_t srcLength, 1174 int32_t start, 1175 int32_t length) const 1176 { 1177 if (isBogus() || srcChars == nullptr || srcStart < 0 || srcLength == 0) { 1178 return -1; 1179 } 1180 1181 // UnicodeString does not find empty substrings 1182 if(srcLength < 0 && srcChars[srcStart] == 0) { 1183 return -1; 1184 } 1185 1186 // get the indices within bounds 1187 pinIndices(start, length); 1188 1189 // find the last occurrence of the substring 1190 const char16_t *array = getArrayStart(); 1191 const char16_t *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1192 if(match == nullptr) { 1193 return -1; 1194 } else { 1195 return static_cast<int32_t>(match - array); 1196 } 1197 } 1198 1199 int32_t 1200 UnicodeString::doLastIndexOf(char16_t c, 1201 int32_t start, 1202 int32_t length) const 1203 { 1204 if(isBogus()) { 1205 return -1; 1206 } 1207 1208 // pin indices 1209 pinIndices(start, length); 1210 1211 // find the last occurrence of c 1212 const char16_t *array = getArrayStart(); 1213 const char16_t *match = u_memrchr(array + start, c, length); 1214 if(match == nullptr) { 1215 return -1; 1216 } else { 1217 return static_cast<int32_t>(match - array); 1218 } 1219 } 1220 1221 int32_t 1222 UnicodeString::doLastIndexOf(UChar32 c, 1223 int32_t start, 1224 int32_t length) const { 1225 // pin indices 1226 pinIndices(start, length); 1227 1228 // find the last occurrence of c 1229 const char16_t *array = getArrayStart(); 1230 const char16_t *match = 
u_memrchr32(array + start, c, length); 1231 if(match == nullptr) { 1232 return -1; 1233 } else { 1234 return static_cast<int32_t>(match - array); 1235 } 1236 } 1237 1238 //======================================== 1239 // Write implementation 1240 //======================================== 1241 1242 UnicodeString& 1243 UnicodeString::findAndReplace(int32_t start, 1244 int32_t length, 1245 const UnicodeString& oldText, 1246 int32_t oldStart, 1247 int32_t oldLength, 1248 const UnicodeString& newText, 1249 int32_t newStart, 1250 int32_t newLength) 1251 { 1252 if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1253 return *this; 1254 } 1255 1256 pinIndices(start, length); 1257 oldText.pinIndices(oldStart, oldLength); 1258 newText.pinIndices(newStart, newLength); 1259 1260 if(oldLength == 0) { 1261 return *this; 1262 } 1263 1264 while(length > 0 && length >= oldLength) { 1265 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1266 if(pos < 0) { 1267 // no more oldText's here: done 1268 break; 1269 } else { 1270 // we found oldText, replace it by newText and go beyond it 1271 replace(pos, oldLength, newText, newStart, newLength); 1272 length -= pos + oldLength - start; 1273 start = pos + newLength; 1274 } 1275 } 1276 1277 return *this; 1278 } 1279 1280 1281 void 1282 UnicodeString::setToBogus() 1283 { 1284 releaseArray(); 1285 1286 fUnion.fFields.fLengthAndFlags = kIsBogus; 1287 fUnion.fFields.fArray = nullptr; 1288 fUnion.fFields.fCapacity = 0; 1289 } 1290 1291 // turn a bogus string into an empty one 1292 void 1293 UnicodeString::unBogus() { 1294 if(fUnion.fFields.fLengthAndFlags & kIsBogus) { 1295 setToEmpty(); 1296 } 1297 } 1298 1299 const char16_t * 1300 UnicodeString::getTerminatedBuffer() { 1301 if(!isWritable()) { 1302 return nullptr; 1303 } 1304 char16_t *array = getArrayStart(); 1305 int32_t len = length(); 1306 if(len < getCapacity()) { 1307 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) { 1308 // If len<capacity on a read-only 
alias, then array[len] is 1309 // either the original NUL (if constructed with (true, s, length)) 1310 // or one of the original string contents characters (if later truncated), 1311 // therefore we can assume that array[len] is initialized memory. 1312 if(array[len] == 0) { 1313 return array; 1314 } 1315 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) { 1316 // kRefCounted: Do not write the NUL if the buffer is shared. 1317 // That is mostly safe, except when the length of one copy was modified 1318 // without copy-on-write, e.g., via truncate(newLength) or remove(). 1319 // Then the NUL would be written into the middle of another copy's string. 1320 1321 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL. 1322 // Do not test if there is a NUL already because it might be uninitialized memory. 1323 // (That would be safe, but tools like valgrind & Purify would complain.) 1324 array[len] = 0; 1325 return array; 1326 } 1327 } 1328 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) { 1329 array = getArrayStart(); 1330 array[len] = 0; 1331 return array; 1332 } else { 1333 return nullptr; 1334 } 1335 } 1336 1337 // setTo() analogous to the readonly-aliasing constructor with the same signature 1338 UnicodeString & 1339 UnicodeString::setTo(UBool isTerminated, 1340 ConstChar16Ptr textPtr, 1341 int32_t textLength) 1342 { 1343 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1344 // do not modify a string that has an "open" getBuffer(minCapacity) 1345 return *this; 1346 } 1347 1348 const char16_t *text = textPtr; 1349 if(text == nullptr) { 1350 // treat as an empty string, do not alias 1351 releaseArray(); 1352 setToEmpty(); 1353 return *this; 1354 } 1355 1356 if( textLength < -1 || 1357 (textLength == -1 && !isTerminated) || 1358 (textLength >= 0 && isTerminated && text[textLength] != 0) 1359 ) { 1360 setToBogus(); 1361 return *this; 1362 } 1363 1364 releaseArray(); 1365 1366 if(textLength == -1) { 1367 // 
text is terminated, or else it would have failed the above test 1368 textLength = u_strlen(text); 1369 } 1370 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 1371 setArray(const_cast<char16_t*>(text), textLength, isTerminated ? textLength + 1 : textLength); 1372 return *this; 1373 } 1374 1375 // setTo() analogous to the writable-aliasing constructor with the same signature 1376 UnicodeString & 1377 UnicodeString::setTo(char16_t *buffer, 1378 int32_t buffLength, 1379 int32_t buffCapacity) { 1380 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1381 // do not modify a string that has an "open" getBuffer(minCapacity) 1382 return *this; 1383 } 1384 1385 if(buffer == nullptr) { 1386 // treat as an empty string, do not alias 1387 releaseArray(); 1388 setToEmpty(); 1389 return *this; 1390 } 1391 1392 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1393 setToBogus(); 1394 return *this; 1395 } else if(buffLength == -1) { 1396 // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1397 const char16_t *p = buffer, *limit = buffer + buffCapacity; 1398 while(p != limit && *p != 0) { 1399 ++p; 1400 } 1401 buffLength = static_cast<int32_t>(p - buffer); 1402 } 1403 1404 releaseArray(); 1405 1406 fUnion.fFields.fLengthAndFlags = kWritableAlias; 1407 setArray(buffer, buffLength, buffCapacity); 1408 return *this; 1409 } 1410 1411 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) { 1412 unBogus(); 1413 int32_t length = utf8.length(); 1414 int32_t capacity; 1415 // The UTF-16 string will be at most as long as the UTF-8 string. 1416 if(length <= US_STACKBUF_SIZE) { 1417 capacity = US_STACKBUF_SIZE; 1418 } else { 1419 capacity = length + 1; // +1 for the terminating NUL. 1420 } 1421 char16_t *utf16 = getBuffer(capacity); 1422 int32_t length16; 1423 UErrorCode errorCode = U_ZERO_ERROR; 1424 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1425 utf8.data(), length, 1426 0xfffd, // Substitution character. 
1427 nullptr, // Don't care about number of substitutions. 1428 &errorCode); 1429 releaseBuffer(length16); 1430 if(U_FAILURE(errorCode)) { 1431 setToBogus(); 1432 } 1433 return *this; 1434 } 1435 1436 UnicodeString& 1437 UnicodeString::setCharAt(int32_t offset, 1438 char16_t c) 1439 { 1440 int32_t len = length(); 1441 if(cloneArrayIfNeeded() && len > 0) { 1442 if(offset < 0) { 1443 offset = 0; 1444 } else if(offset >= len) { 1445 offset = len - 1; 1446 } 1447 1448 getArrayStart()[offset] = c; 1449 } 1450 return *this; 1451 } 1452 1453 UnicodeString& 1454 UnicodeString::replace(int32_t start, 1455 int32_t _length, 1456 UChar32 srcChar) { 1457 char16_t buffer[U16_MAX_LENGTH]; 1458 int32_t count = 0; 1459 UBool isError = false; 1460 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1461 // We test isError so that the compiler does not complain that we don't. 1462 // If isError (srcChar is not a valid code point) then count==0 which means 1463 // we remove the source segment rather than replacing it with srcChar. 1464 return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1465 } 1466 1467 UnicodeString& 1468 UnicodeString::append(UChar32 srcChar) { 1469 char16_t buffer[U16_MAX_LENGTH]; 1470 int32_t _length = 0; 1471 UBool isError = false; 1472 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1473 // We test isError so that the compiler does not complain that we don't. 1474 // If isError then _length==0 which turns the doAppend() into a no-op anyway. 1475 return isError ? 
*this : doAppend(buffer, 0, _length); 1476 } 1477 1478 UnicodeString& 1479 UnicodeString::doReplace( int32_t start, 1480 int32_t length, 1481 const UnicodeString& src, 1482 int32_t srcStart, 1483 int32_t srcLength) 1484 { 1485 // pin the indices to legal values 1486 src.pinIndices(srcStart, srcLength); 1487 1488 // get the characters from src 1489 // and replace the range in ourselves with them 1490 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1491 } 1492 1493 UnicodeString& 1494 UnicodeString::doReplace(int32_t start, 1495 int32_t length, 1496 const char16_t *srcChars, 1497 int32_t srcStart, 1498 int32_t srcLength) 1499 { 1500 if(!isWritable()) { 1501 return *this; 1502 } 1503 1504 int32_t oldLength = this->length(); 1505 1506 // optimize (read-only alias).remove(0, start) and .remove(start, end) 1507 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) { 1508 if(start == 0) { 1509 // remove prefix by adjusting the array pointer 1510 pinIndex(length); 1511 fUnion.fFields.fArray += length; 1512 fUnion.fFields.fCapacity -= length; 1513 setLength(oldLength - length); 1514 return *this; 1515 } else { 1516 pinIndex(start); 1517 if(length >= (oldLength - start)) { 1518 // remove suffix by reducing the length (like truncate()) 1519 setLength(start); 1520 fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1521 return *this; 1522 } 1523 } 1524 } 1525 1526 if(start == oldLength) { 1527 return doAppend(srcChars, srcStart, srcLength); 1528 } 1529 1530 if (srcChars == nullptr) { 1531 srcLength = 0; 1532 } else { 1533 // Perform all remaining operations relative to srcChars + srcStart. 1534 // From this point forward, do not use srcStart. 
1535 srcChars += srcStart; 1536 if (srcLength < 0) { 1537 // get the srcLength if necessary 1538 srcLength = u_strlen(srcChars); 1539 } 1540 } 1541 1542 // pin the indices to legal values 1543 pinIndices(start, length); 1544 1545 // Calculate the size of the string after the replace. 1546 // Avoid int32_t overflow. 1547 int32_t newLength = oldLength - length; 1548 if(srcLength > (INT32_MAX - newLength)) { 1549 setToBogus(); 1550 return *this; 1551 } 1552 newLength += srcLength; 1553 1554 // Check for insertion into ourself 1555 const char16_t *oldArray = getArrayStart(); 1556 if (isBufferWritable() && 1557 oldArray < srcChars + srcLength && 1558 srcChars < oldArray + oldLength) { 1559 // Copy into a new UnicodeString and start over 1560 UnicodeString copy(srcChars, srcLength); 1561 if (copy.isBogus()) { 1562 setToBogus(); 1563 return *this; 1564 } 1565 return doReplace(start, length, copy.getArrayStart(), 0, srcLength); 1566 } 1567 1568 // cloneArrayIfNeeded(doCopyArray=false) may change fArray but will not copy the current contents; 1569 // therefore we need to keep the current fArray 1570 char16_t oldStackBuffer[US_STACKBUF_SIZE]; 1571 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1572 // copy the stack buffer contents because it will be overwritten with 1573 // fUnion.fFields values 1574 u_memcpy(oldStackBuffer, oldArray, oldLength); 1575 oldArray = oldStackBuffer; 1576 } 1577 1578 // clone our array and allocate a bigger array if needed 1579 int32_t *bufferToDelete = nullptr; 1580 if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength), 1581 false, &bufferToDelete) 1582 ) { 1583 return *this; 1584 } 1585 1586 // now do the replace 1587 1588 char16_t *newArray = getArrayStart(); 1589 if(newArray != oldArray) { 1590 // if fArray changed, then we need to copy everything except what will change 1591 us_arrayCopy(oldArray, 0, newArray, 0, start); 1592 us_arrayCopy(oldArray, start + length, 1593 newArray, start + 
srcLength, 1594 oldLength - (start + length)); 1595 } else if(length != srcLength) { 1596 // fArray did not change; copy only the portion that isn't changing, leaving a hole 1597 us_arrayCopy(oldArray, start + length, 1598 newArray, start + srcLength, 1599 oldLength - (start + length)); 1600 } 1601 1602 // now fill in the hole with the new string 1603 us_arrayCopy(srcChars, 0, newArray, start, srcLength); 1604 1605 setLength(newLength); 1606 1607 // delayed delete in case srcChars == fArray when we started, and 1608 // to keep oldArray alive for the above operations 1609 if (bufferToDelete) { 1610 uprv_free(bufferToDelete); 1611 } 1612 1613 return *this; 1614 } 1615 1616 UnicodeString& 1617 UnicodeString::doReplace(int32_t start, int32_t length, std::u16string_view src) { 1618 if (!isWritable()) { 1619 return *this; 1620 } 1621 if (src.length() > INT32_MAX) { 1622 setToBogus(); 1623 return *this; 1624 } 1625 return doReplace(start, length, src.data(), 0, static_cast<int32_t>(src.length())); 1626 } 1627 1628 // Versions of doReplace() only for append() variants. 1629 // doReplace() and doAppend() optimize for different cases. 1630 1631 UnicodeString& 1632 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) { 1633 if(srcLength == 0) { 1634 return *this; 1635 } 1636 1637 // pin the indices to legal values 1638 src.pinIndices(srcStart, srcLength); 1639 return doAppend(src.getArrayStart(), srcStart, srcLength); 1640 } 1641 1642 UnicodeString& 1643 UnicodeString::doAppend(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) { 1644 if(!isWritable() || srcLength == 0 || srcChars == nullptr) { 1645 return *this; 1646 } 1647 1648 // Perform all remaining operations relative to srcChars + srcStart. 1649 // From this point forward, do not use srcStart. 
1650 srcChars += srcStart; 1651 1652 if(srcLength < 0) { 1653 // get the srcLength if necessary 1654 if((srcLength = u_strlen(srcChars)) == 0) { 1655 return *this; 1656 } 1657 } 1658 1659 int32_t oldLength = length(); 1660 int32_t newLength; 1661 1662 if (srcLength <= getCapacity() - oldLength && isBufferWritable()) { 1663 newLength = oldLength + srcLength; 1664 // Faster than a memmove 1665 if (srcLength <= 4) { 1666 char16_t *arr = getArrayStart(); 1667 arr[oldLength] = srcChars[0]; 1668 if (srcLength > 1) arr[oldLength+1] = srcChars[1]; 1669 if (srcLength > 2) arr[oldLength+2] = srcChars[2]; 1670 if (srcLength > 3) arr[oldLength+3] = srcChars[3]; 1671 setLength(newLength); 1672 return *this; 1673 } 1674 } else { 1675 if (uprv_add32_overflow(oldLength, srcLength, &newLength)) { 1676 setToBogus(); 1677 return *this; 1678 } 1679 1680 // Check for append onto ourself 1681 const char16_t* oldArray = getArrayStart(); 1682 if (isBufferWritable() && 1683 oldArray < srcChars + srcLength && 1684 srcChars < oldArray + oldLength) { 1685 // Copy into a new UnicodeString and start over 1686 UnicodeString copy(srcChars, srcLength); 1687 if (copy.isBogus()) { 1688 setToBogus(); 1689 return *this; 1690 } 1691 return doAppend(copy.getArrayStart(), 0, srcLength); 1692 } 1693 1694 // optimize append() onto a large-enough, owned string 1695 if (!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) { 1696 return *this; 1697 } 1698 } 1699 1700 char16_t *newArray = getArrayStart(); 1701 // Do not copy characters when 1702 // char16_t *buffer=str.getAppendBuffer(...); 1703 // is followed by 1704 // str.append(buffer, length); 1705 // or 1706 // str.appendString(buffer, length) 1707 // or similar. 
1708 if(srcChars != newArray + oldLength) { 1709 us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength); 1710 } 1711 setLength(newLength); 1712 1713 return *this; 1714 } 1715 1716 UnicodeString& 1717 UnicodeString::doAppend(std::u16string_view src) { 1718 if (!isWritable() || src.empty()) { 1719 return *this; 1720 } 1721 if (src.length() > INT32_MAX) { 1722 setToBogus(); 1723 return *this; 1724 } 1725 return doAppend(src.data(), 0, static_cast<int32_t>(src.length())); 1726 } 1727 1728 /** 1729 * Replaceable API 1730 */ 1731 void 1732 UnicodeString::handleReplaceBetween(int32_t start, 1733 int32_t limit, 1734 const UnicodeString& text) { 1735 replaceBetween(start, limit, text); 1736 } 1737 1738 /** 1739 * Replaceable API 1740 */ 1741 void 1742 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1743 if (limit <= start) { 1744 return; // Nothing to do; avoid bogus malloc call 1745 } 1746 char16_t* text = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (limit - start))); 1747 // Check to make sure text is not null. 1748 if (text != nullptr) { 1749 extractBetween(start, limit, text, 0); 1750 insert(dest, text, 0, limit - start); 1751 uprv_free(text); 1752 } 1753 } 1754 1755 /** 1756 * Replaceable API 1757 * 1758 * NOTE: This is for the Replaceable class. There is no rep.cpp, 1759 * so we implement this function here. 
1760 */ 1761 UBool Replaceable::hasMetaData() const { 1762 return true; 1763 } 1764 1765 /** 1766 * Replaceable API 1767 */ 1768 UBool UnicodeString::hasMetaData() const { 1769 return false; 1770 } 1771 1772 UnicodeString& 1773 UnicodeString::doReverse(int32_t start, int32_t length) { 1774 if(length <= 1 || !cloneArrayIfNeeded()) { 1775 return *this; 1776 } 1777 1778 // pin the indices to legal values 1779 pinIndices(start, length); 1780 if(length <= 1) { // pinIndices() might have shrunk the length 1781 return *this; 1782 } 1783 1784 char16_t *left = getArrayStart() + start; 1785 char16_t *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1786 char16_t swap; 1787 UBool hasSupplementary = false; 1788 1789 // Before the loop we know left<right because length>=2. 1790 do { 1791 hasSupplementary |= static_cast<UBool>(U16_IS_LEAD(swap = *left)); 1792 hasSupplementary |= static_cast<UBool>(U16_IS_LEAD(*left++ = *right)); 1793 *right-- = swap; 1794 } while(left < right); 1795 // Make sure to test the middle code unit of an odd-length string. 1796 // Redundant if the length is even. 
1797 hasSupplementary |= static_cast<UBool>(U16_IS_LEAD(*left)); 1798 1799 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1800 if(hasSupplementary) { 1801 char16_t swap2; 1802 1803 left = getArrayStart() + start; 1804 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1805 while(left < right) { 1806 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1807 *left++ = swap2; 1808 *left++ = swap; 1809 } else { 1810 ++left; 1811 } 1812 } 1813 } 1814 1815 return *this; 1816 } 1817 1818 UBool 1819 UnicodeString::padLeading(int32_t targetLength, 1820 char16_t padChar) 1821 { 1822 int32_t oldLength = length(); 1823 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1824 return false; 1825 } else { 1826 // move contents up by padding width 1827 char16_t *array = getArrayStart(); 1828 int32_t start = targetLength - oldLength; 1829 us_arrayCopy(array, 0, array, start, oldLength); 1830 1831 // fill in padding character 1832 while(--start >= 0) { 1833 array[start] = padChar; 1834 } 1835 setLength(targetLength); 1836 return true; 1837 } 1838 } 1839 1840 UBool 1841 UnicodeString::padTrailing(int32_t targetLength, 1842 char16_t padChar) 1843 { 1844 int32_t oldLength = length(); 1845 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1846 return false; 1847 } else { 1848 // fill in padding character 1849 char16_t *array = getArrayStart(); 1850 int32_t length = targetLength; 1851 while(--length >= oldLength) { 1852 array[length] = padChar; 1853 } 1854 setLength(targetLength); 1855 return true; 1856 } 1857 } 1858 1859 //======================================== 1860 // Hashing 1861 //======================================== 1862 int32_t 1863 UnicodeString::doHashCode() const 1864 { 1865 /* Delegate hash computation to uhash. This makes UnicodeString 1866 * hashing consistent with char16_t* hashing. 
*/ 1867 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1868 if (hashCode == kInvalidHashCode) { 1869 hashCode = kEmptyHashCode; 1870 } 1871 return hashCode; 1872 } 1873 1874 //======================================== 1875 // External Buffer 1876 //======================================== 1877 1878 char16_t * 1879 UnicodeString::getBuffer(int32_t minCapacity) { 1880 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1881 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer; 1882 setZeroLength(); 1883 return getArrayStart(); 1884 } else { 1885 return nullptr; 1886 } 1887 } 1888 1889 void 1890 UnicodeString::releaseBuffer(int32_t newLength) { 1891 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) { 1892 // set the new fLength 1893 int32_t capacity=getCapacity(); 1894 if(newLength==-1) { 1895 // the new length is the string length, capped by fCapacity 1896 const char16_t *array=getArrayStart(), *p=array, *limit=array+capacity; 1897 while(p<limit && *p!=0) { 1898 ++p; 1899 } 1900 newLength = static_cast<int32_t>(p - array); 1901 } else if(newLength>capacity) { 1902 newLength=capacity; 1903 } 1904 setLength(newLength); 1905 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer; 1906 } 1907 } 1908 1909 //======================================== 1910 // Miscellaneous 1911 //======================================== 1912 UBool 1913 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1914 int32_t growCapacity, 1915 UBool doCopyArray, 1916 int32_t **pBufferToDelete, 1917 UBool forceClone) { 1918 // default parameters need to be static, therefore 1919 // the defaults are -1 to have convenience defaults 1920 if(newCapacity == -1) { 1921 newCapacity = getCapacity(); 1922 } 1923 1924 // while a getBuffer(minCapacity) is "open", 1925 // prevent any modifications of the string by returning false here 1926 // if the string is bogus, then only an assignment or similar can revive it 1927 if(!isWritable()) { 1928 return false; 1929 } 1930 1931 /* 1932 * We 
need to make a copy of the array if 1933 * the buffer is read-only, or 1934 * the buffer is refCounted (shared), and refCount>1, or 1935 * the buffer is too small. 1936 * Return false if memory could not be allocated. 1937 */ 1938 if(forceClone || 1939 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly || 1940 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) || 1941 newCapacity > getCapacity() 1942 ) { 1943 // check growCapacity for default value and use of the stack buffer 1944 if(growCapacity < 0) { 1945 growCapacity = newCapacity; 1946 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1947 growCapacity = US_STACKBUF_SIZE; 1948 } else if(newCapacity > growCapacity) { 1949 setToBogus(); 1950 return false; // bad inputs 1951 } 1952 if(growCapacity > kMaxCapacity) { 1953 setToBogus(); 1954 return false; 1955 } 1956 1957 // save old values 1958 char16_t oldStackBuffer[US_STACKBUF_SIZE]; 1959 char16_t *oldArray; 1960 int32_t oldLength = length(); 1961 int16_t flags = fUnion.fFields.fLengthAndFlags; 1962 1963 if(flags&kUsingStackBuffer) { 1964 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1965 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1966 // copy the stack buffer contents because it will be overwritten with 1967 // fUnion.fFields values 1968 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength); 1969 oldArray = oldStackBuffer; 1970 } else { 1971 oldArray = nullptr; // no need to copy from the stack buffer to itself 1972 } 1973 } else { 1974 oldArray = fUnion.fFields.fArray; 1975 U_ASSERT(oldArray!=nullptr); /* when stack buffer is not used, oldArray must have a non-nullptr reference */ 1976 } 1977 1978 // allocate a new array 1979 if(allocate(growCapacity) || 1980 (newCapacity < growCapacity && allocate(newCapacity)) 1981 ) { 1982 if(doCopyArray) { 1983 // copy the contents 1984 // do not copy more than what fits - it may be smaller than before 
1985 int32_t minLength = oldLength; 1986 newCapacity = getCapacity(); 1987 if(newCapacity < minLength) { 1988 minLength = newCapacity; 1989 } 1990 if(oldArray != nullptr) { 1991 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1992 } 1993 setLength(minLength); 1994 } else { 1995 setZeroLength(); 1996 } 1997 1998 // release the old array 1999 if(flags & kRefCounted) { 2000 // the array is refCounted; decrement and release if 0 2001 u_atomic_int32_t* pRefCount = reinterpret_cast<u_atomic_int32_t*>(oldArray) - 1; 2002 if(umtx_atomic_dec(pRefCount) == 0) { 2003 if (pBufferToDelete == nullptr) { 2004 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t 2005 // is defined as volatile. (Volatile has useful non-standard behavior 2006 // with this compiler.) 2007 uprv_free((void *)pRefCount); 2008 } else { 2009 // the caller requested to delete it himself 2010 *pBufferToDelete = reinterpret_cast<int32_t*>(pRefCount); 2011 } 2012 } 2013 } 2014 } else { 2015 // not enough memory for growCapacity and not even for the smaller newCapacity 2016 // reset the old values for setToBogus() to release the array 2017 if(!(flags&kUsingStackBuffer)) { 2018 fUnion.fFields.fArray = oldArray; 2019 } 2020 fUnion.fFields.fLengthAndFlags = flags; 2021 setToBogus(); 2022 return false; 2023 } 2024 } 2025 return true; 2026 } 2027 2028 // UnicodeStringAppendable ------------------------------------------------- *** 2029 2030 UnicodeStringAppendable::~UnicodeStringAppendable() {} 2031 2032 UBool 2033 UnicodeStringAppendable::appendCodeUnit(char16_t c) { 2034 return str.doAppend(&c, 0, 1).isWritable(); 2035 } 2036 2037 UBool 2038 UnicodeStringAppendable::appendCodePoint(UChar32 c) { 2039 char16_t buffer[U16_MAX_LENGTH]; 2040 int32_t cLength = 0; 2041 UBool isError = false; 2042 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 2043 return !isError && str.doAppend(buffer, 0, cLength).isWritable(); 2044 } 2045 2046 UBool 2047 UnicodeStringAppendable::appendString(const 
char16_t *s, int32_t length) { 2048 return str.doAppend(s, 0, length).isWritable(); 2049 } 2050 2051 UBool 2052 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 2053 return str.cloneArrayIfNeeded(str.length() + appendCapacity); 2054 } 2055 2056 char16_t * 2057 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 2058 int32_t desiredCapacityHint, 2059 char16_t *scratch, int32_t scratchCapacity, 2060 int32_t *resultCapacity) { 2061 if(minCapacity < 1 || scratchCapacity < minCapacity) { 2062 *resultCapacity = 0; 2063 return nullptr; 2064 } 2065 int32_t oldLength = str.length(); 2066 if(minCapacity <= (kMaxCapacity - oldLength) && 2067 desiredCapacityHint <= (kMaxCapacity - oldLength) && 2068 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 2069 *resultCapacity = str.getCapacity() - oldLength; 2070 return str.getArrayStart() + oldLength; 2071 } 2072 *resultCapacity = scratchCapacity; 2073 return scratch; 2074 } 2075 2076 U_NAMESPACE_END 2077 2078 U_NAMESPACE_USE 2079 2080 U_CAPI int32_t U_EXPORT2 2081 uhash_hashUnicodeString(const UElement key) { 2082 const UnicodeString *str = (const UnicodeString*) key.pointer; 2083 return (str == nullptr) ? 0 : str->hashCode(); 2084 } 2085 2086 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 2087 // does not depend on hashtable code. 2088 U_CAPI UBool U_EXPORT2 2089 uhash_compareUnicodeString(const UElement key1, const UElement key2) { 2090 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 2091 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 2092 if (str1 == str2) { 2093 return true; 2094 } 2095 if (str1 == nullptr || str2 == nullptr) { 2096 return false; 2097 } 2098 return *str1 == *str2; 2099 } 2100 2101 #ifdef U_STATIC_IMPLEMENTATION 2102 /* 2103 This should never be called. It is defined here to make sure that the 2104 virtual vector deleting destructor is defined within unistr.cpp. 
2105 The vector deleting destructor is already a part of UObject, 2106 but defining it here makes sure that it is included with this object file. 2107 This makes sure that static library dependencies are kept to a minimum. 2108 */ 2109 #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100 2110 #pragma GCC diagnostic push 2111 #pragma GCC diagnostic ignored "-Wunused-function" 2112 static void uprv_UnicodeStringDummy() { 2113 delete [] (new UnicodeString[2]); 2114 } 2115 #pragma GCC diagnostic pop 2116 #endif 2117 #endif