uniset.cpp (72729B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 10/20/99 alan Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 #include "unicode/parsepos.h" 15 #include "unicode/symtable.h" 16 #include "unicode/uniset.h" 17 #include "unicode/ustring.h" 18 #include "unicode/utf8.h" 19 #include "unicode/utf16.h" 20 #include "ruleiter.h" 21 #include "cmemory.h" 22 #include "cstring.h" 23 #include "patternprops.h" 24 #include "uelement.h" 25 #include "util.h" 26 #include "uvector.h" 27 #include "charstr.h" 28 #include "ustrfmt.h" 29 #include "uassert.h" 30 #include "bmpset.h" 31 #include "unisetspan.h" 32 33 // HIGH_VALUE > all valid values. 110000 for codepoints 34 #define UNICODESET_HIGH 0x0110000 35 36 // LOW <= all valid values. ZERO for codepoints 37 #define UNICODESET_LOW 0x000000 38 39 /** Max list [0, 1, 2, ..., max code point, HIGH] */ 40 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; 41 42 U_NAMESPACE_BEGIN 43 44 SymbolTable::~SymbolTable() {} 45 46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) 47 48 /** 49 * Modify the given UChar32 variable so that it is in range, by 50 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and 51 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. 52 * It modifies its argument in-place and also returns it. 53 */ 54 static inline UChar32 pinCodePoint(UChar32& c) { 55 if (c < UNICODESET_LOW) { 56 c = UNICODESET_LOW; 57 } else if (c > (UNICODESET_HIGH-1)) { 58 c = (UNICODESET_HIGH-1); 59 } 60 return c; 61 } 62 63 //---------------------------------------------------------------- 64 // Debugging 65 //---------------------------------------------------------------- 66 67 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks. 68 // To enable the debugging, define the symbol DEBUG_MEM in the line 69 // below. This will result in text being sent to stdout that looks 70 // like this: 71 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- 72 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- 73 // Each line lists a construction (ct) or destruction (dt) event, the 74 // object address, the number of outstanding objects after the event, 75 // and the pattern of the object in question. 76 77 // #define DEBUG_MEM 78 79 #ifdef DEBUG_MEM 80 #include <stdio.h> 81 static int32_t _dbgCount = 0; 82 83 static inline void _dbgct(UnicodeSet* set) { 84 UnicodeString str; 85 set->toPattern(str, true); 86 char buf[40]; 87 str.extract(0, 39, buf, ""); 88 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); 89 } 90 91 static inline void _dbgdt(UnicodeSet* set) { 92 UnicodeString str; 93 set->toPattern(str, true); 94 char buf[40]; 95 str.extract(0, 39, buf, ""); 96 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); 97 } 98 99 #else 100 101 #define _dbgct(set) 102 #define _dbgdt(set) 103 104 #endif 105 106 //---------------------------------------------------------------- 107 // UnicodeString in UVector support 108 //---------------------------------------------------------------- 109 110 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { 111 dst->pointer = new UnicodeString(*static_cast<UnicodeString*>(src->pointer)); 112 } 113 114 static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 115 const UnicodeString& a = *static_cast<const UnicodeString*>(t1.pointer); 116 const UnicodeString& b = *static_cast<const UnicodeString*>(t2.pointer); 117 return a.compare(b); 118 } 119 120 UBool UnicodeSet::hasStrings() const { 121 return strings_ != nullptr && !strings_->isEmpty(); 122 } 123 124 int32_t UnicodeSet::stringsSize() const { 125 return strings_ == nullptr ? 0 : strings_->size(); 126 } 127 128 UBool UnicodeSet::stringsContains(const UnicodeString &s) const { 129 return strings_ != nullptr && strings_->contains((void*) &s); 130 } 131 132 //---------------------------------------------------------------- 133 // Constructors &c 134 //---------------------------------------------------------------- 135 136 /** 137 * Constructs an empty set. 138 */ 139 UnicodeSet::UnicodeSet() { 140 list[0] = UNICODESET_HIGH; 141 _dbgct(this); 142 } 143 144 /** 145 * Constructs a set containing the given range. If <code>end > 146 * start</code> then an empty set is created. 147 * 148 * @param start first character, inclusive, of range 149 * @param end last character, inclusive, of range 150 */ 151 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { 152 list[0] = UNICODESET_HIGH; 153 add(start, end); 154 _dbgct(this); 155 } 156 157 /** 158 * Constructs a set that is identical to the given UnicodeSet. 159 */ 160 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { 161 *this = o; 162 _dbgct(this); 163 } 164 165 // Copy-construct as thawed. 166 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { 167 if (ensureCapacity(o.len)) { 168 // *this = o except for bmpSet and stringSpan 169 len = o.len; 170 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); 171 if (o.hasStrings()) { 172 UErrorCode status = U_ZERO_ERROR; 173 if (!allocateStrings(status) || 174 (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) { 175 setToBogus(); 176 return; 177 } 178 } 179 if (o.pat) { 180 setPattern(o.pat, o.patLen); 181 } 182 _dbgct(this); 183 } 184 } 185 186 /** 187 * Destructs the set. 188 */ 189 UnicodeSet::~UnicodeSet() { 190 _dbgdt(this); // first! 191 if (list != stackList) { 192 uprv_free(list); 193 } 194 delete bmpSet; 195 if (buffer != stackList) { 196 uprv_free(buffer); 197 } 198 delete strings_; 199 delete stringSpan; 200 releasePattern(); 201 } 202 203 /** 204 * Assigns this object to be a copy of another. 205 */ 206 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { 207 return copyFrom(o, false); 208 } 209 210 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { 211 if (this == &o) { 212 return *this; 213 } 214 if (isFrozen()) { 215 return *this; 216 } 217 if (o.isBogus()) { 218 setToBogus(); 219 return *this; 220 } 221 if (!ensureCapacity(o.len)) { 222 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. 223 return *this; 224 } 225 len = o.len; 226 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); 227 if (o.bmpSet != nullptr && !asThawed) { 228 bmpSet = new BMPSet(*o.bmpSet, list, len); 229 if (bmpSet == nullptr) { // Check for memory allocation error. 230 setToBogus(); 231 return *this; 232 } 233 } 234 if (o.hasStrings()) { 235 UErrorCode status = U_ZERO_ERROR; 236 if ((strings_ == nullptr && !allocateStrings(status)) || 237 (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) { 238 setToBogus(); 239 return *this; 240 } 241 } else if (hasStrings()) { 242 strings_->removeAllElements(); 243 } 244 if (o.stringSpan != nullptr && !asThawed) { 245 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_); 246 if (stringSpan == nullptr) { // Check for memory allocation error. 247 setToBogus(); 248 return *this; 249 } 250 } 251 releasePattern(); 252 if (o.pat) { 253 setPattern(o.pat, o.patLen); 254 } 255 return *this; 256 } 257 258 /** 259 * Returns a copy of this object. All UnicodeMatcher objects have 260 * to support cloning in order to allow classes using 261 * UnicodeMatchers, such as Transliterator, to implement cloning. 262 */ 263 UnicodeSet* UnicodeSet::clone() const { 264 return new UnicodeSet(*this); 265 } 266 267 UnicodeSet *UnicodeSet::cloneAsThawed() const { 268 return new UnicodeSet(*this, true); 269 } 270 271 /** 272 * Compares the specified object with this set for equality. Returns 273 * <tt>true</tt> if the two sets 274 * have the same size, and every member of the specified set is 275 * contained in this set (or equivalently, every member of this set is 276 * contained in the specified set). 277 * 278 * @param o set to be compared for equality with this set. 279 * @return <tt>true</tt> if the specified set is equal to this set. 280 */ 281 bool UnicodeSet::operator==(const UnicodeSet& o) const { 282 if (len != o.len) return false; 283 for (int32_t i = 0; i < len; ++i) { 284 if (list[i] != o.list[i]) return false; 285 } 286 if (hasStrings() != o.hasStrings()) { return false; } 287 if (hasStrings() && *strings_ != *o.strings_) return false; 288 return true; 289 } 290 291 /** 292 * Returns the hash code value for this set. 293 * 294 * @return the hash code value for this set. 295 * @see Object#hashCode() 296 */ 297 int32_t UnicodeSet::hashCode() const { 298 uint32_t result = static_cast<uint32_t>(len); 299 for (int32_t i = 0; i < len; ++i) { 300 result *= 1000003u; 301 result += list[i]; 302 } 303 return static_cast<int32_t>(result); 304 } 305 306 //---------------------------------------------------------------- 307 // Public API 308 //---------------------------------------------------------------- 309 310 /** 311 * Returns the number of elements in this set (its cardinality), 312 * Note than the elements of a set may include both individual 313 * codepoints and strings. 314 * 315 * @return the number of elements in this set (its cardinality). 316 */ 317 int32_t UnicodeSet::size() const { 318 int32_t n = 0; 319 int32_t count = getRangeCount(); 320 for (int32_t i = 0; i < count; ++i) { 321 n += getRangeEnd(i) - getRangeStart(i) + 1; 322 } 323 return n + stringsSize(); 324 } 325 326 /** 327 * Returns <tt>true</tt> if this set contains no elements. 328 * 329 * @return <tt>true</tt> if this set contains no elements. 330 */ 331 UBool UnicodeSet::isEmpty() const { 332 return len == 1 && !hasStrings(); 333 } 334 335 /** 336 * Returns true if this set contains the given character. 337 * @param c character to be checked for containment 338 * @return true if the test condition is met 339 */ 340 UBool UnicodeSet::contains(UChar32 c) const { 341 // Set i to the index of the start item greater than ch 342 // We know we will terminate without length test! 343 // LATER: for large sets, add binary search 344 //int32_t i = -1; 345 //for (;;) { 346 // if (c < list[++i]) break; 347 //} 348 if (bmpSet != nullptr) { 349 return bmpSet->contains(c); 350 } 351 if (stringSpan != nullptr) { 352 return stringSpan->contains(c); 353 } 354 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound 355 return false; 356 } 357 int32_t i = findCodePoint(c); 358 return i & 1; // return true if odd 359 } 360 361 /** 362 * Returns the smallest value i such that c < list[i]. Caller 363 * must ensure that c is a legal value or this method will enter 364 * an infinite loop. This method performs a binary search. 365 * @param c a character in the range MIN_VALUE..MAX_VALUE 366 * inclusive 367 * @return the smallest integer i in the range 0..len-1, 368 * inclusive, such that c < list[i] 369 */ 370 int32_t UnicodeSet::findCodePoint(UChar32 c) const { 371 /* Examples: 372 findCodePoint(c) 373 set list[] c=0 1 3 4 7 8 374 === ============== =========== 375 [] [110000] 0 0 0 0 0 0 376 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 377 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 378 [:Any:] [0, 110000] 1 1 1 1 1 1 379 */ 380 381 // Return the smallest i such that c < list[i]. Assume 382 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 383 if (c < list[0]) 384 return 0; 385 // High runner test. c is often after the last range, so an 386 // initial check for this condition pays off. 387 int32_t lo = 0; 388 int32_t hi = len - 1; 389 if (lo >= hi || c >= list[hi-1]) 390 return hi; 391 // invariant: c >= list[lo] 392 // invariant: c < list[hi] 393 for (;;) { 394 int32_t i = (lo + hi) >> 1; 395 if (i == lo) { 396 break; // Found! 397 } else if (c < list[i]) { 398 hi = i; 399 } else { 400 lo = i; 401 } 402 } 403 return hi; 404 } 405 406 /** 407 * Returns true if this set contains every character 408 * of the given range. 409 * @param start first character, inclusive, of the range 410 * @param end last character, inclusive, of the range 411 * @return true if the test condition is met 412 */ 413 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { 414 //int32_t i = -1; 415 //for (;;) { 416 // if (start < list[++i]) break; 417 //} 418 int32_t i = findCodePoint(start); 419 return ((i & 1) != 0 && end < list[i]); 420 } 421 422 /** 423 * Returns <tt>true</tt> if this set contains the given 424 * multicharacter string. 425 * @param s string to be checked for containment 426 * @return <tt>true</tt> if this set contains the specified string 427 */ 428 UBool UnicodeSet::contains(const UnicodeString& s) const { 429 int32_t cp = getSingleCP(s); 430 if (cp < 0) { 431 return stringsContains(s); 432 } else { 433 return contains(static_cast<UChar32>(cp)); 434 } 435 } 436 437 /** 438 * Returns true if this set contains all the characters and strings 439 * of the given set. 440 * @param c set to be checked for containment 441 * @return true if the test condition is met 442 */ 443 UBool UnicodeSet::containsAll(const UnicodeSet& c) const { 444 // The specified set is a subset if all of its pairs are contained in 445 // this set. It's possible to code this more efficiently in terms of 446 // direct manipulation of the inversion lists if the need arises. 447 int32_t n = c.getRangeCount(); 448 for (int i=0; i<n; ++i) { 449 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 450 return false; 451 } 452 } 453 return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_)); 454 } 455 456 /** 457 * Returns true if this set contains all the characters 458 * of the given string. 459 * @param s string containing characters to be checked for containment 460 * @return true if the test condition is met 461 */ 462 UBool UnicodeSet::containsAll(const UnicodeString& s) const { 463 return span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == s.length(); 464 } 465 466 /** 467 * Returns true if this set contains none of the characters 468 * of the given range. 469 * @param start first character, inclusive, of the range 470 * @param end last character, inclusive, of the range 471 * @return true if the test condition is met 472 */ 473 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { 474 //int32_t i = -1; 475 //for (;;) { 476 // if (start < list[++i]) break; 477 //} 478 int32_t i = findCodePoint(start); 479 return ((i & 1) == 0 && end < list[i]); 480 } 481 482 /** 483 * Returns true if this set contains none of the characters and strings 484 * of the given set. 485 * @param c set to be checked for containment 486 * @return true if the test condition is met 487 */ 488 UBool UnicodeSet::containsNone(const UnicodeSet& c) const { 489 // The specified set is a subset if all of its pairs are contained in 490 // this set. It's possible to code this more efficiently in terms of 491 // direct manipulation of the inversion lists if the need arises. 492 int32_t n = c.getRangeCount(); 493 for (int32_t i=0; i<n; ++i) { 494 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 495 return false; 496 } 497 } 498 return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_); 499 } 500 501 /** 502 * Returns true if this set contains none of the characters 503 * of the given string. 504 * @param s string containing characters to be checked for containment 505 * @return true if the test condition is met 506 */ 507 UBool UnicodeSet::containsNone(const UnicodeString& s) const { 508 return span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == s.length(); 509 } 510 511 /** 512 * Returns <tt>true</tt> if this set contains any character whose low byte 513 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 514 * indexing. 515 */ 516 UBool UnicodeSet::matchesIndexValue(uint8_t v) const { 517 /* The index value v, in the range [0,255], is contained in this set if 518 * it is contained in any pair of this set. Pairs either have the high 519 * bytes equal, or unequal. If the high bytes are equal, then we have 520 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 521 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 522 * Then v is contained if xx <= v || v <= yy. (This is identical to the 523 * time zone month containment logic.) 524 */ 525 int32_t i; 526 int32_t rangeCount=getRangeCount(); 527 for (i=0; i<rangeCount; ++i) { 528 UChar32 low = getRangeStart(i); 529 UChar32 high = getRangeEnd(i); 530 if ((low & ~0xFF) == (high & ~0xFF)) { 531 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 532 return true; 533 } 534 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 535 return true; 536 } 537 } 538 if (hasStrings()) { 539 for (i=0; i<strings_->size(); ++i) { 540 const UnicodeString& s = *static_cast<const UnicodeString*>(strings_->elementAt(i)); 541 if (s.isEmpty()) { 542 continue; // skip the empty string 543 } 544 UChar32 c = s.char32At(0); 545 if ((c & 0xFF) == v) { 546 return true; 547 } 548 } 549 } 550 return false; 551 } 552 553 /** 554 * Implementation of UnicodeMatcher::matches(). Always matches the 555 * longest possible multichar string. 556 */ 557 UMatchDegree UnicodeSet::matches(const Replaceable& text, 558 int32_t& offset, 559 int32_t limit, 560 UBool incremental) { 561 if (offset == limit) { 562 if (contains(U_ETHER)) { 563 return incremental ? U_PARTIAL_MATCH : U_MATCH; 564 } else { 565 return U_MISMATCH; 566 } 567 } else { 568 if (hasStrings()) { // try strings first 569 570 // might separate forward and backward loops later 571 // for now they are combined 572 573 // TODO Improve efficiency of this, at least in the forward 574 // direction, if not in both. In the forward direction we 575 // can assume the strings are sorted. 576 577 int32_t i; 578 UBool forward = offset < limit; 579 580 // firstChar is the leftmost char to match in the 581 // forward direction or the rightmost char to match in 582 // the reverse direction. 583 char16_t firstChar = text.charAt(offset); 584 585 // If there are multiple strings that can match we 586 // return the longest match. 587 int32_t highWaterLength = 0; 588 589 for (i=0; i<strings_->size(); ++i) { 590 const UnicodeString& trial = *static_cast<const UnicodeString*>(strings_->elementAt(i)); 591 if (trial.isEmpty()) { 592 continue; // skip the empty string 593 } 594 595 char16_t c = trial.charAt(forward ? 0 : trial.length() - 1); 596 597 // Strings are sorted, so we can optimize in the 598 // forward direction. 599 if (forward && c > firstChar) break; 600 if (c != firstChar) continue; 601 602 int32_t matchLen = matchRest(text, offset, limit, trial); 603 604 if (incremental) { 605 int32_t maxLen = forward ? limit-offset : offset-limit; 606 if (matchLen == maxLen) { 607 // We have successfully matched but only up to limit. 608 return U_PARTIAL_MATCH; 609 } 610 } 611 612 if (matchLen == trial.length()) { 613 // We have successfully matched the whole string. 614 if (matchLen > highWaterLength) { 615 highWaterLength = matchLen; 616 } 617 // In the forward direction we know strings 618 // are sorted so we can bail early. 619 if (forward && matchLen < highWaterLength) { 620 break; 621 } 622 continue; 623 } 624 } 625 626 // We've checked all strings without a partial match. 627 // If we have full matches, return the longest one. 628 if (highWaterLength != 0) { 629 offset += forward ? highWaterLength : -highWaterLength; 630 return U_MATCH; 631 } 632 } 633 return UnicodeFilter::matches(text, offset, limit, incremental); 634 } 635 } 636 637 /** 638 * Returns the longest match for s in text at the given position. 639 * If limit > start then match forward from start+1 to limit 640 * matching all characters except s.charAt(0). If limit < start, 641 * go backward starting from start-1 matching all characters 642 * except s.charAt(s.length()-1). This method assumes that the 643 * first character, text.charAt(start), matches s, so it does not 644 * check it. 645 * @param text the text to match 646 * @param start the first character to match. In the forward 647 * direction, text.charAt(start) is matched against s.charAt(0). 648 * In the reverse direction, it is matched against 649 * s.charAt(s.length()-1). 650 * @param limit the limit offset for matching, either last+1 in 651 * the forward direction, or last-1 in the reverse direction, 652 * where last is the index of the last character to match. 653 * @return If part of s matches up to the limit, return |limit - 654 * start|. If all of s matches before reaching the limit, return 655 * s.length(). If there is a mismatch between s and text, return 656 * 0 657 */ 658 int32_t UnicodeSet::matchRest(const Replaceable& text, 659 int32_t start, int32_t limit, 660 const UnicodeString& s) { 661 int32_t i; 662 int32_t maxLen; 663 int32_t slen = s.length(); 664 if (start < limit) { 665 maxLen = limit - start; 666 if (maxLen > slen) maxLen = slen; 667 for (i = 1; i < maxLen; ++i) { 668 if (text.charAt(start + i) != s.charAt(i)) return 0; 669 } 670 } else { 671 maxLen = start - limit; 672 if (maxLen > slen) maxLen = slen; 673 --slen; // <=> slen = s.length() - 1; 674 for (i = 1; i < maxLen; ++i) { 675 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 676 } 677 } 678 return maxLen; 679 } 680 681 /** 682 * Implement of UnicodeMatcher 683 */ 684 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { 685 toUnionTo.addAll(*this); 686 } 687 688 /** 689 * Returns the index of the given character within this set, where 690 * the set is ordered by ascending code point. If the character 691 * is not in this set, return -1. The inverse of this method is 692 * <code>charAt()</code>. 693 * @return an index from 0..size()-1, or -1 694 */ 695 int32_t UnicodeSet::indexOf(UChar32 c) const { 696 if (c < MIN_VALUE || c > MAX_VALUE) { 697 return -1; 698 } 699 int32_t i = 0; 700 int32_t n = 0; 701 for (;;) { 702 UChar32 start = list[i++]; 703 if (c < start) { 704 return -1; 705 } 706 UChar32 limit = list[i++]; 707 if (c < limit) { 708 return n + c - start; 709 } 710 n += limit - start; 711 } 712 } 713 714 /** 715 * Returns the character at the given index within this set, where 716 * the set is ordered by ascending code point. If the index is 717 * out of range, return (UChar32)-1. The inverse of this method is 718 * <code>indexOf()</code>. 719 * @param index an index from 0..size()-1 720 * @return the character at the given index, or (UChar32)-1. 721 */ 722 UChar32 UnicodeSet::charAt(int32_t index) const { 723 if (index >= 0) { 724 // len2 is the largest even integer <= len, that is, it is len 725 // for even values and len-1 for odd values. With odd values 726 // the last entry is UNICODESET_HIGH. 727 int32_t len2 = len & ~1; 728 for (int32_t i=0; i < len2;) { 729 UChar32 start = list[i++]; 730 int32_t count = list[i++] - start; 731 if (index < count) { 732 return static_cast<UChar32>(start + index); 733 } 734 index -= count; 735 } 736 } 737 return static_cast<UChar32>(-1); 738 } 739 740 /** 741 * Make this object represent the range <code>start - end</code>. 742 * If <code>end > start</code> then this object is set to an 743 * an empty range. 744 * 745 * @param start first character in the set, inclusive 746 * @rparam end last character in the set, inclusive 747 */ 748 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { 749 clear(); 750 complement(start, end); 751 return *this; 752 } 753 754 /** 755 * Adds the specified range to this set if it is not already 756 * present. If this set already contains the specified range, 757 * the call leaves this set unchanged. If <code>end > start</code> 758 * then an empty range is added, leaving the set unchanged. 759 * 760 * @param start first character, inclusive, of range to be added 761 * to this set. 762 * @param end last character, inclusive, of range to be added 763 * to this set. 764 */ 765 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { 766 if (pinCodePoint(start) < pinCodePoint(end)) { 767 UChar32 limit = end + 1; 768 // Fast path for adding a new range after the last one. 769 // Odd list length: [..., lastStart, lastLimit, HIGH] 770 if ((len & 1) != 0) { 771 // If the list is empty, set lastLimit low enough to not be adjacent to 0. 772 UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; 773 if (lastLimit <= start && !isFrozen() && !isBogus()) { 774 if (lastLimit == start) { 775 // Extend the last range. 776 list[len - 2] = limit; 777 if (limit == UNICODESET_HIGH) { 778 --len; 779 } 780 } else { 781 list[len - 1] = start; 782 if (limit < UNICODESET_HIGH) { 783 if (ensureCapacity(len + 2)) { 784 list[len++] = limit; 785 list[len++] = UNICODESET_HIGH; 786 } 787 } else { // limit == UNICODESET_HIGH 788 if (ensureCapacity(len + 1)) { 789 list[len++] = UNICODESET_HIGH; 790 } 791 } 792 } 793 releasePattern(); 794 return *this; 795 } 796 } 797 // This is slow. Could be much faster using findCodePoint(start) 798 // and modifying the list, dealing with adjacent & overlapping ranges. 799 UChar32 range[3] = { start, limit, UNICODESET_HIGH }; 800 add(range, 2, 0); 801 } else if (start == end) { 802 add(start); 803 } 804 return *this; 805 } 806 807 // #define DEBUG_US_ADD 808 809 #ifdef DEBUG_US_ADD 810 #include <stdio.h> 811 void dump(UChar32 c) { 812 if (c <= 0xFF) { 813 printf("%c", (char)c); 814 } else { 815 printf("U+%04X", c); 816 } 817 } 818 void dump(const UChar32* list, int32_t len) { 819 printf("["); 820 for (int32_t i=0; i<len; ++i) { 821 if (i != 0) printf(", "); 822 dump(list[i]); 823 } 824 printf("]"); 825 } 826 #endif 827 828 /** 829 * Adds the specified character to this set if it is not already 830 * present. If this set already contains the specified character, 831 * the call leaves this set unchanged. 832 */ 833 UnicodeSet& UnicodeSet::add(UChar32 c) { 834 // find smallest i such that c < list[i] 835 // if odd, then it is IN the set 836 // if even, then it is OUT of the set 837 int32_t i = findCodePoint(pinCodePoint(c)); 838 839 // already in set? 840 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; 841 842 // HIGH is 0x110000 843 // assert(list[len-1] == HIGH); 844 845 // empty = [HIGH] 846 // [start_0, limit_0, start_1, limit_1, HIGH] 847 848 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 849 // ^ 850 // list[i] 851 852 // i == 0 means c is before the first range 853 854 #ifdef DEBUG_US_ADD 855 printf("Add of "); 856 dump(c); 857 printf(" found at %d", i); 858 printf(": "); 859 dump(list, len); 860 printf(" => "); 861 #endif 862 863 if (c == list[i]-1) { 864 // c is before start of next range 865 list[i] = c; 866 // if we touched the HIGH mark, then add a new one 867 if (c == (UNICODESET_HIGH - 1)) { 868 if (!ensureCapacity(len+1)) { 869 // ensureCapacity will mark the object as Bogus if OOM failure happens. 870 return *this; 871 } 872 list[len++] = UNICODESET_HIGH; 873 } 874 if (i > 0 && c == list[i-1]) { 875 // collapse adjacent ranges 876 877 // [..., start_k-1, c, c, limit_k, ..., HIGH] 878 // ^ 879 // list[i] 880 881 //for (int32_t k=i-1; k<len-2; ++k) { 882 // list[k] = list[k+2]; 883 //} 884 UChar32* dst = list + i - 1; 885 UChar32* src = dst + 2; 886 UChar32* srclimit = list + len; 887 while (src < srclimit) *(dst++) = *(src++); 888 889 len -= 2; 890 } 891 } 892 893 else if (i > 0 && c == list[i-1]) { 894 // c is after end of prior range 895 list[i-1]++; 896 // no need to check for collapse here 897 } 898 899 else { 900 // At this point we know the new char is not adjacent to 901 // any existing ranges, and it is not 10FFFF. 902 903 904 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 905 // ^ 906 // list[i] 907 908 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 909 // ^ 910 // list[i] 911 912 if (!ensureCapacity(len+2)) { 913 // ensureCapacity will mark the object as Bogus if OOM failure happens. 914 return *this; 915 } 916 917 UChar32 *p = list + i; 918 uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); 919 list[i] = c; 920 list[i+1] = c+1; 921 len += 2; 922 } 923 924 #ifdef DEBUG_US_ADD 925 dump(list, len); 926 printf("\n"); 927 928 for (i=1; i<len; ++i) { 929 if (list[i] <= list[i-1]) { 930 // Corrupt array! 931 printf("ERROR: list has been corrupted\n"); 932 exit(1); 933 } 934 } 935 #endif 936 937 releasePattern(); 938 return *this; 939 } 940 941 /** 942 * Adds the specified multicharacter to this set if it is not already 943 * present. If this set already contains the multicharacter, 944 * the call leaves this set unchanged. 945 * Thus "ch" => {"ch"} 946 * 947 * @param s the source string 948 * @return the modified set, for chaining 949 */ 950 UnicodeSet& UnicodeSet::add(const UnicodeString& s) { 951 if (isFrozen() || isBogus()) return *this; 952 int32_t cp = getSingleCP(s); 953 if (cp < 0) { 954 if (!stringsContains(s)) { 955 _add(s); 956 releasePattern(); 957 } 958 } else { 959 add(static_cast<UChar32>(cp)); 960 } 961 return *this; 962 } 963 964 /** 965 * Adds the given string, in order, to 'strings_'. The given string 966 * must have been checked by the caller to not already be in 'strings_'. 967 */ 968 void UnicodeSet::_add(const UnicodeString& s) { 969 if (isFrozen() || isBogus()) { 970 return; 971 } 972 UErrorCode ec = U_ZERO_ERROR; 973 if (strings_ == nullptr && !allocateStrings(ec)) { 974 setToBogus(); 975 return; 976 } 977 LocalPointer<UnicodeString> t(new UnicodeString(s)); 978 if (t.isNull()) { // Check for memory allocation error. 979 setToBogus(); 980 return; 981 } 982 strings_->sortedInsert(t.orphan(), compareUnicodeString, ec); 983 if (U_FAILURE(ec)) { 984 setToBogus(); 985 } 986 } 987 988 /** 989 * @return a code point IF the string consists of a single one. 990 * otherwise returns -1. 991 * @param string to test 992 */ 993 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { 994 int32_t sLength = s.length(); 995 if (sLength == 1) return s.charAt(0); 996 if (sLength == 2) { 997 UChar32 cp = s.char32At(0); 998 if (cp > 0xFFFF) { // is surrogate pair 999 return cp; 1000 } 1001 } 1002 return -1; 1003 } 1004 1005 /** 1006 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1007 * If this set already any particular character, it has no effect on that character. 1008 * @param the source string 1009 * @return the modified set, for chaining 1010 */ 1011 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { 1012 UChar32 cp; 1013 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1014 cp = s.char32At(i); 1015 add(cp); 1016 } 1017 return *this; 1018 } 1019 1020 /** 1021 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1022 * If this set already any particular character, it has no effect on that character. 1023 * @param the source string 1024 * @return the modified set, for chaining 1025 */ 1026 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { 1027 UnicodeSet set; 1028 set.addAll(s); 1029 retainAll(set); 1030 return *this; 1031 } 1032 1033 /** 1034 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1035 * If this set already any particular character, it has no effect on that character. 1036 * @param the source string 1037 * @return the modified set, for chaining 1038 */ 1039 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { 1040 UnicodeSet set; 1041 set.addAll(s); 1042 complementAll(set); 1043 return *this; 1044 } 1045 1046 /** 1047 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1048 * If this set already any particular character, it has no effect on that character. 1049 * @param the source string 1050 * @return the modified set, for chaining 1051 */ 1052 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { 1053 UnicodeSet set; 1054 set.addAll(s); 1055 removeAll(set); 1056 return *this; 1057 } 1058 1059 UnicodeSet& UnicodeSet::removeAllStrings() { 1060 if (!isFrozen() && hasStrings()) { 1061 strings_->removeAllElements(); 1062 releasePattern(); 1063 } 1064 return *this; 1065 } 1066 1067 1068 /** 1069 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1070 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1071 * @param the source string 1072 * @return a newly created set containing the given string 1073 */ 1074 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { 1075 UnicodeSet *set = new UnicodeSet(); 1076 if (set != nullptr) { // Check for memory allocation error. 1077 set->add(s); 1078 } 1079 return set; 1080 } 1081 1082 1083 /** 1084 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1085 * @param the source string 1086 * @return a newly created set containing the given characters 1087 */ 1088 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { 1089 UnicodeSet *set = new UnicodeSet(); 1090 if (set != nullptr) { // Check for memory allocation error. 1091 set->addAll(s); 1092 } 1093 return set; 1094 } 1095 1096 /** 1097 * Retain only the elements in this set that are contained in the 1098 * specified range. If <code>end > start</code> then an empty range is 1099 * retained, leaving the set empty. 1100 * 1101 * @param start first character, inclusive, of range to be retained 1102 * to this set. 1103 * @param end last character, inclusive, of range to be retained 1104 * to this set. 1105 */ 1106 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { 1107 if (pinCodePoint(start) <= pinCodePoint(end)) { 1108 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1109 retain(range, 2, 0); 1110 } else { 1111 clear(); 1112 } 1113 return *this; 1114 } 1115 1116 UnicodeSet& UnicodeSet::retain(UChar32 c) { 1117 return retain(c, c); 1118 } 1119 1120 UnicodeSet& UnicodeSet::retain(const UnicodeString &s) { 1121 if (isFrozen() || isBogus()) { return *this; } 1122 UChar32 cp = getSingleCP(s); 1123 if (cp < 0) { 1124 bool isIn = stringsContains(s); 1125 // Check for getRangeCount() first to avoid somewhat-expensive size() 1126 // when there are single code points. 1127 if (isIn && getRangeCount() == 0 && size() == 1) { 1128 return *this; 1129 } 1130 clear(); 1131 if (isIn) { 1132 _add(s); 1133 } 1134 } else { 1135 retain(cp, cp); 1136 } 1137 return *this; 1138 } 1139 1140 /** 1141 * Removes the specified range from this set if it is present. 1142 * The set will not contain the specified range once the call 1143 * returns. If <code>end > start</code> then an empty range is 1144 * removed, leaving the set unchanged. 1145 * 1146 * @param start first character, inclusive, of range to be removed 1147 * from this set. 1148 * @param end last character, inclusive, of range to be removed 1149 * from this set. 1150 */ 1151 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { 1152 if (pinCodePoint(start) <= pinCodePoint(end)) { 1153 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1154 retain(range, 2, 2); 1155 } 1156 return *this; 1157 } 1158 1159 /** 1160 * Removes the specified character from this set if it is present. 1161 * The set will not contain the specified range once the call 1162 * returns. 1163 */ 1164 UnicodeSet& UnicodeSet::remove(UChar32 c) { 1165 return remove(c, c); 1166 } 1167 1168 /** 1169 * Removes the specified string from this set if it is present. 1170 * The set will not contain the specified character once the call 1171 * returns. 1172 * @param the source string 1173 * @return the modified set, for chaining 1174 */ 1175 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { 1176 if (isFrozen() || isBogus()) return *this; 1177 int32_t cp = getSingleCP(s); 1178 if (cp < 0) { 1179 if (strings_ != nullptr && strings_->removeElement((void*) &s)) { 1180 releasePattern(); 1181 } 1182 } else { 1183 remove(static_cast<UChar32>(cp), static_cast<UChar32>(cp)); 1184 } 1185 return *this; 1186 } 1187 1188 /** 1189 * Complements the specified range in this set. Any character in 1190 * the range will be removed if it is in this set, or will be 1191 * added if it is not in this set. If <code>end > start</code> 1192 * then an empty range is xor'ed, leaving the set unchanged. 1193 * 1194 * @param start first character, inclusive, of range to be removed 1195 * from this set. 1196 * @param end last character, inclusive, of range to be removed 1197 * from this set. 1198 */ 1199 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { 1200 if (isFrozen() || isBogus()) { 1201 return *this; 1202 } 1203 if (pinCodePoint(start) <= pinCodePoint(end)) { 1204 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1205 exclusiveOr(range, 2, 0); 1206 } 1207 releasePattern(); 1208 return *this; 1209 } 1210 1211 UnicodeSet& UnicodeSet::complement(UChar32 c) { 1212 return complement(c, c); 1213 } 1214 1215 /** 1216 * This is equivalent to 1217 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1218 */ 1219 UnicodeSet& UnicodeSet::complement() { 1220 if (isFrozen() || isBogus()) { 1221 return *this; 1222 } 1223 if (list[0] == UNICODESET_LOW) { 1224 uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); 1225 --len; 1226 } else { 1227 if (!ensureCapacity(len+1)) { 1228 return *this; 1229 } 1230 uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); 1231 list[0] = UNICODESET_LOW; 1232 ++len; 1233 } 1234 releasePattern(); 1235 return *this; 1236 } 1237 1238 /** 1239 * Complement the specified string in this set. 1240 * The set will not contain the specified string once the call 1241 * returns. 1242 * 1243 * @param s the string to complement 1244 * @return this object, for chaining 1245 */ 1246 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { 1247 if (isFrozen() || isBogus()) return *this; 1248 int32_t cp = getSingleCP(s); 1249 if (cp < 0) { 1250 if (stringsContains(s)) { 1251 strings_->removeElement((void*) &s); 1252 } else { 1253 _add(s); 1254 } 1255 releasePattern(); 1256 } else { 1257 complement(static_cast<UChar32>(cp), static_cast<UChar32>(cp)); 1258 } 1259 return *this; 1260 } 1261 1262 /** 1263 * Adds all of the elements in the specified set to this set if 1264 * they're not already present. This operation effectively 1265 * modifies this set so that its value is the <i>union</i> of the two 1266 * sets. The behavior of this operation is unspecified if the specified 1267 * collection is modified while the operation is in progress. 1268 * 1269 * @param c set whose elements are to be added to this set. 1270 * @see #add(char, char) 1271 */ 1272 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { 1273 if ( c.len>0 && c.list!=nullptr ) { 1274 add(c.list, c.len, 0); 1275 } 1276 1277 // Add strings in order 1278 if ( c.strings_!=nullptr ) { 1279 for (int32_t i=0; i<c.strings_->size(); ++i) { 1280 const UnicodeString* s = static_cast<const UnicodeString*>(c.strings_->elementAt(i)); 1281 if (!stringsContains(*s)) { 1282 _add(*s); 1283 } 1284 } 1285 } 1286 return *this; 1287 } 1288 1289 /** 1290 * Retains only the elements in this set that are contained in the 1291 * specified set. In other words, removes from this set all of 1292 * its elements that are not contained in the specified set. This 1293 * operation effectively modifies this set so that its value is 1294 * the <i>intersection</i> of the two sets. 1295 * 1296 * @param c set that defines which elements this set will retain. 1297 */ 1298 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { 1299 if (isFrozen() || isBogus()) { 1300 return *this; 1301 } 1302 retain(c.list, c.len, 0); 1303 if (hasStrings()) { 1304 if (!c.hasStrings()) { 1305 strings_->removeAllElements(); 1306 } else { 1307 strings_->retainAll(*c.strings_); 1308 } 1309 } 1310 return *this; 1311 } 1312 1313 /** 1314 * Removes from this set all of its elements that are contained in the 1315 * specified set. This operation effectively modifies this 1316 * set so that its value is the <i>asymmetric set difference</i> of 1317 * the two sets. 1318 * 1319 * @param c set that defines which elements will be removed from 1320 * this set. 1321 */ 1322 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { 1323 if (isFrozen() || isBogus()) { 1324 return *this; 1325 } 1326 retain(c.list, c.len, 2); 1327 if (hasStrings() && c.hasStrings()) { 1328 strings_->removeAll(*c.strings_); 1329 } 1330 return *this; 1331 } 1332 1333 /** 1334 * Complements in this set all elements contained in the specified 1335 * set. Any character in the other set will be removed if it is 1336 * in this set, or will be added if it is not in this set. 1337 * 1338 * @param c set that defines which elements will be xor'ed from 1339 * this set. 1340 */ 1341 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { 1342 if (isFrozen() || isBogus()) { 1343 return *this; 1344 } 1345 exclusiveOr(c.list, c.len, 0); 1346 1347 if (c.strings_ != nullptr) { 1348 for (int32_t i=0; i<c.strings_->size(); ++i) { 1349 void* e = c.strings_->elementAt(i); 1350 if (strings_ == nullptr || !strings_->removeElement(e)) { 1351 _add(*static_cast<const UnicodeString*>(e)); 1352 } 1353 } 1354 } 1355 return *this; 1356 } 1357 1358 /** 1359 * Removes all of the elements from this set. This set will be 1360 * empty after this call returns. 1361 */ 1362 UnicodeSet& UnicodeSet::clear() { 1363 if (isFrozen()) { 1364 return *this; 1365 } 1366 list[0] = UNICODESET_HIGH; 1367 len = 1; 1368 releasePattern(); 1369 if (strings_ != nullptr) { 1370 strings_->removeAllElements(); 1371 } 1372 // Remove bogus 1373 fFlags = 0; 1374 return *this; 1375 } 1376 1377 /** 1378 * Iteration method that returns the number of ranges contained in 1379 * this set. 1380 * @see #getRangeStart 1381 * @see #getRangeEnd 1382 */ 1383 int32_t UnicodeSet::getRangeCount() const { 1384 return len/2; 1385 } 1386 1387 /** 1388 * Iteration method that returns the first character in the 1389 * specified range of this set. 1390 * @see #getRangeCount 1391 * @see #getRangeEnd 1392 */ 1393 UChar32 UnicodeSet::getRangeStart(int32_t index) const { 1394 return list[index*2]; 1395 } 1396 1397 /** 1398 * Iteration method that returns the last character in the 1399 * specified range of this set. 1400 * @see #getRangeStart 1401 * @see #getRangeEnd 1402 */ 1403 UChar32 UnicodeSet::getRangeEnd(int32_t index) const { 1404 return list[index*2 + 1] - 1; 1405 } 1406 1407 const UnicodeString* UnicodeSet::getString(int32_t index) const { 1408 return static_cast<const UnicodeString*>(strings_->elementAt(index)); 1409 } 1410 1411 /** 1412 * Reallocate this objects internal structures to take up the least 1413 * possible space, without changing this object's value. 1414 */ 1415 UnicodeSet& UnicodeSet::compact() { 1416 if (isFrozen() || isBogus()) { 1417 return *this; 1418 } 1419 // Delete buffer first to defragment memory less. 1420 if (buffer != stackList) { 1421 uprv_free(buffer); 1422 buffer = nullptr; 1423 bufferCapacity = 0; 1424 } 1425 if (list == stackList) { 1426 // pass 1427 } else if (len <= INITIAL_CAPACITY) { 1428 uprv_memcpy(stackList, list, len * sizeof(UChar32)); 1429 uprv_free(list); 1430 list = stackList; 1431 capacity = INITIAL_CAPACITY; 1432 } else if ((len + 7) < capacity) { 1433 // If we have more than a little unused capacity, shrink it to len. 1434 UChar32* temp = static_cast<UChar32*>(uprv_realloc(list, sizeof(UChar32) * len)); 1435 if (temp) { 1436 list = temp; 1437 capacity = len; 1438 } 1439 // else what the heck happened?! We allocated less memory! 1440 // Oh well. We'll keep our original array. 1441 } 1442 if (strings_ != nullptr && strings_->isEmpty()) { 1443 delete strings_; 1444 strings_ = nullptr; 1445 } 1446 return *this; 1447 } 1448 1449 #ifdef DEBUG_SERIALIZE 1450 #include <stdio.h> 1451 #endif 1452 1453 /** 1454 * Deserialize constructor. 1455 */ 1456 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, 1457 UErrorCode &ec) { 1458 1459 if(U_FAILURE(ec)) { 1460 setToBogus(); 1461 return; 1462 } 1463 1464 if( (serialization != kSerialized) 1465 || (data==nullptr) 1466 || (dataLen < 1)) { 1467 ec = U_ILLEGAL_ARGUMENT_ERROR; 1468 setToBogus(); 1469 return; 1470 } 1471 1472 // bmp? 1473 int32_t headerSize = ((data[0]&0x8000)) ?2:1; 1474 int32_t bmpLength = (headerSize==1)?data[0]:data[1]; 1475 1476 int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; 1477 #ifdef DEBUG_SERIALIZE 1478 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); 1479 #endif 1480 if(!ensureCapacity(newLength + 1)) { // +1 for HIGH 1481 return; 1482 } 1483 // copy bmp 1484 int32_t i; 1485 for(i = 0; i< bmpLength;i++) { 1486 list[i] = data[i+headerSize]; 1487 #ifdef DEBUG_SERIALIZE 1488 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); 1489 #endif 1490 } 1491 // copy smp 1492 for(i=bmpLength;i<newLength;i++) { 1493 list[i] = (static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 0]) << 16) + 1494 static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 1]); 1495 #ifdef DEBUG_SERIALIZE 1496 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]); 1497 #endif 1498 } 1499 U_ASSERT(i == newLength); 1500 if (i == 0 || list[i - 1] != UNICODESET_HIGH) { 1501 list[i++] = UNICODESET_HIGH; 1502 } 1503 len = i; 1504 } 1505 1506 1507 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { 1508 int32_t bmpLength, length, destLength; 1509 1510 if (U_FAILURE(ec)) { 1511 return 0; 1512 } 1513 1514 if (destCapacity<0 || (destCapacity>0 && dest==nullptr)) { 1515 ec=U_ILLEGAL_ARGUMENT_ERROR; 1516 return 0; 1517 } 1518 1519 /* count necessary 16-bit units */ 1520 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH 1521 // assert(length>=0); 1522 if (length==0) { 1523 /* empty set */ 1524 if (destCapacity>0) { 1525 *dest=0; 1526 } else { 1527 ec=U_BUFFER_OVERFLOW_ERROR; 1528 } 1529 return 1; 1530 } 1531 /* now length>0 */ 1532 1533 if (this->list[length-1]<=0xffff) { 1534 /* all BMP */ 1535 bmpLength=length; 1536 } else if (this->list[0]>=0x10000) { 1537 /* all supplementary */ 1538 bmpLength=0; 1539 length*=2; 1540 } else { 1541 /* some BMP, some supplementary */ 1542 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} 1543 length=bmpLength+2*(length-bmpLength); 1544 } 1545 #ifdef DEBUG_SERIALIZE 1546 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); 1547 #endif 1548 /* length: number of 16-bit array units */ 1549 if (length>0x7fff) { 1550 /* there are only 15 bits for the length in the first serialized word */ 1551 ec=U_INDEX_OUTOFBOUNDS_ERROR; 1552 return 0; 1553 } 1554 1555 /* 1556 * total serialized length: 1557 * number of 16-bit array units (length) + 1558 * 1 length unit (always) + 1559 * 1 bmpLength unit (if there are supplementary values) 1560 */ 1561 destLength=length+((length>bmpLength)?2:1); 1562 if (destLength<=destCapacity) { 1563 const UChar32 *p; 1564 int32_t i; 1565 1566 #ifdef DEBUG_SERIALIZE 1567 printf("writeHdr\n"); 1568 #endif 1569 *dest = static_cast<uint16_t>(length); 1570 if (length>bmpLength) { 1571 *dest|=0x8000; 1572 *++dest = static_cast<uint16_t>(bmpLength); 1573 } 1574 ++dest; 1575 1576 /* write the BMP part of the array */ 1577 p=this->list; 1578 for (i=0; i<bmpLength; ++i) { 1579 #ifdef DEBUG_SERIALIZE 1580 printf("writebmp: %x\n", (int)*p); 1581 #endif 1582 *dest++ = static_cast<uint16_t>(*p++); 1583 } 1584 1585 /* write the supplementary part of the array */ 1586 for (; i<length; i+=2) { 1587 #ifdef DEBUG_SERIALIZE 1588 printf("write32: %x\n", (int)*p); 1589 #endif 1590 *dest++ = static_cast<uint16_t>(*p >> 16); 1591 *dest++ = static_cast<uint16_t>(*p++); 1592 } 1593 } else { 1594 ec=U_BUFFER_OVERFLOW_ERROR; 1595 } 1596 return destLength; 1597 } 1598 1599 //---------------------------------------------------------------- 1600 // Implementation: Utility methods 1601 //---------------------------------------------------------------- 1602 1603 /** 1604 * Allocate our strings vector and return true if successful. 1605 */ 1606 UBool UnicodeSet::allocateStrings(UErrorCode &status) { 1607 if (U_FAILURE(status)) { 1608 return false; 1609 } 1610 strings_ = new UVector(uprv_deleteUObject, 1611 uhash_compareUnicodeString, 1, status); 1612 if (strings_ == nullptr) { // Check for memory allocation error. 1613 status = U_MEMORY_ALLOCATION_ERROR; 1614 return false; 1615 } 1616 if (U_FAILURE(status)) { 1617 delete strings_; 1618 strings_ = nullptr; 1619 return false; 1620 } 1621 return true; 1622 } 1623 1624 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) { 1625 // Grow exponentially to reduce the frequency of allocations. 1626 if (minCapacity < INITIAL_CAPACITY) { 1627 return minCapacity + INITIAL_CAPACITY; 1628 } else if (minCapacity <= 2500) { 1629 return 5 * minCapacity; 1630 } else { 1631 int32_t newCapacity = 2 * minCapacity; 1632 if (newCapacity > MAX_LENGTH) { 1633 newCapacity = MAX_LENGTH; 1634 } 1635 return newCapacity; 1636 } 1637 } 1638 1639 bool UnicodeSet::ensureCapacity(int32_t newLen) { 1640 if (newLen > MAX_LENGTH) { 1641 newLen = MAX_LENGTH; 1642 } 1643 if (newLen <= capacity) { 1644 return true; 1645 } 1646 int32_t newCapacity = nextCapacity(newLen); 1647 UChar32* temp = static_cast<UChar32*>(uprv_malloc(newCapacity * sizeof(UChar32))); 1648 if (temp == nullptr) { 1649 setToBogus(); // set the object to bogus state if an OOM failure occurred. 1650 return false; 1651 } 1652 // Copy only the actual contents. 1653 uprv_memcpy(temp, list, len * sizeof(UChar32)); 1654 if (list != stackList) { 1655 uprv_free(list); 1656 } 1657 list = temp; 1658 capacity = newCapacity; 1659 return true; 1660 } 1661 1662 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { 1663 if (newLen > MAX_LENGTH) { 1664 newLen = MAX_LENGTH; 1665 } 1666 if (newLen <= bufferCapacity) { 1667 return true; 1668 } 1669 int32_t newCapacity = nextCapacity(newLen); 1670 UChar32* temp = static_cast<UChar32*>(uprv_malloc(newCapacity * sizeof(UChar32))); 1671 if (temp == nullptr) { 1672 setToBogus(); 1673 return false; 1674 } 1675 // The buffer has no contents to be copied. 1676 // It is always filled from scratch after this call. 1677 if (buffer != stackList) { 1678 uprv_free(buffer); 1679 } 1680 buffer = temp; 1681 bufferCapacity = newCapacity; 1682 return true; 1683 } 1684 1685 /** 1686 * Swap list and buffer. 1687 */ 1688 void UnicodeSet::swapBuffers() { 1689 // swap list and buffer 1690 UChar32* temp = list; 1691 list = buffer; 1692 buffer = temp; 1693 1694 int32_t c = capacity; 1695 capacity = bufferCapacity; 1696 bufferCapacity = c; 1697 } 1698 1699 void UnicodeSet::setToBogus() { 1700 clear(); // Remove everything in the set. 1701 fFlags = kIsBogus; 1702 } 1703 1704 //---------------------------------------------------------------- 1705 // Implementation: Fundamental operators 1706 //---------------------------------------------------------------- 1707 1708 static inline UChar32 max(UChar32 a, UChar32 b) { 1709 return (a > b) ? a : b; 1710 } 1711 1712 // polarity = 0, 3 is normal: x xor y 1713 // polarity = 1, 2: x xor ~y == x === y 1714 1715 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { 1716 if (isFrozen() || isBogus()) { 1717 return; 1718 } 1719 if (!ensureBufferCapacity(len + otherLen)) { 1720 return; 1721 } 1722 1723 int32_t i = 0, j = 0, k = 0; 1724 UChar32 a = list[i++]; 1725 UChar32 b; 1726 if (polarity == 1 || polarity == 2) { 1727 b = UNICODESET_LOW; 1728 if (other[j] == UNICODESET_LOW) { // skip base if already LOW 1729 ++j; 1730 b = other[j]; 1731 } 1732 } else { 1733 b = other[j++]; 1734 } 1735 // simplest of all the routines 1736 // sort the values, discarding identicals! 1737 for (;;) { 1738 if (a < b) { 1739 buffer[k++] = a; 1740 a = list[i++]; 1741 } else if (b < a) { 1742 buffer[k++] = b; 1743 b = other[j++]; 1744 } else if (a != UNICODESET_HIGH) { // at this point, a == b 1745 // discard both values! 1746 a = list[i++]; 1747 b = other[j++]; 1748 } else { // DONE! 1749 buffer[k++] = UNICODESET_HIGH; 1750 len = k; 1751 break; 1752 } 1753 } 1754 swapBuffers(); 1755 releasePattern(); 1756 } 1757 1758 // polarity = 0 is normal: x union y 1759 // polarity = 2: x union ~y 1760 // polarity = 1: ~x union y 1761 // polarity = 3: ~x union ~y 1762 1763 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { 1764 if (isFrozen() || isBogus() || other==nullptr) { 1765 return; 1766 } 1767 if (!ensureBufferCapacity(len + otherLen)) { 1768 return; 1769 } 1770 1771 int32_t i = 0, j = 0, k = 0; 1772 UChar32 a = list[i++]; 1773 UChar32 b = other[j++]; 1774 // change from xor is that we have to check overlapping pairs 1775 // polarity bit 1 means a is second, bit 2 means b is. 1776 for (;;) { 1777 switch (polarity) { 1778 case 0: // both first; take lower if unequal 1779 if (a < b) { // take a 1780 // Back up over overlapping ranges in buffer[] 1781 if (k > 0 && a <= buffer[k-1]) { 1782 // Pick latter end value in buffer[] vs. list[] 1783 a = max(list[i], buffer[--k]); 1784 } else { 1785 // No overlap 1786 buffer[k++] = a; 1787 a = list[i]; 1788 } 1789 i++; // Common if/else code factored out 1790 polarity ^= 1; 1791 } else if (b < a) { // take b 1792 if (k > 0 && b <= buffer[k-1]) { 1793 b = max(other[j], buffer[--k]); 1794 } else { 1795 buffer[k++] = b; 1796 b = other[j]; 1797 } 1798 j++; 1799 polarity ^= 2; 1800 } else { // a == b, take a, drop b 1801 if (a == UNICODESET_HIGH) goto loop_end; 1802 // This is symmetrical; it doesn't matter if 1803 // we backtrack with a or b. - liu 1804 if (k > 0 && a <= buffer[k-1]) { 1805 a = max(list[i], buffer[--k]); 1806 } else { 1807 // No overlap 1808 buffer[k++] = a; 1809 a = list[i]; 1810 } 1811 i++; 1812 polarity ^= 1; 1813 b = other[j++]; 1814 polarity ^= 2; 1815 } 1816 break; 1817 case 3: // both second; take higher if unequal, and drop other 1818 if (b <= a) { // take a 1819 if (a == UNICODESET_HIGH) goto loop_end; 1820 buffer[k++] = a; 1821 } else { // take b 1822 if (b == UNICODESET_HIGH) goto loop_end; 1823 buffer[k++] = b; 1824 } 1825 a = list[i++]; 1826 polarity ^= 1; // factored common code 1827 b = other[j++]; 1828 polarity ^= 2; 1829 break; 1830 case 1: // a second, b first; if b < a, overlap 1831 if (a < b) { // no overlap, take a 1832 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1833 } else if (b < a) { // OVERLAP, drop b 1834 b = other[j++]; 1835 polarity ^= 2; 1836 } else { // a == b, drop both! 1837 if (a == UNICODESET_HIGH) goto loop_end; 1838 a = list[i++]; 1839 polarity ^= 1; 1840 b = other[j++]; 1841 polarity ^= 2; 1842 } 1843 break; 1844 case 2: // a first, b second; if a < b, overlap 1845 if (b < a) { // no overlap, take b 1846 buffer[k++] = b; 1847 b = other[j++]; 1848 polarity ^= 2; 1849 } else if (a < b) { // OVERLAP, drop a 1850 a = list[i++]; 1851 polarity ^= 1; 1852 } else { // a == b, drop both! 1853 if (a == UNICODESET_HIGH) goto loop_end; 1854 a = list[i++]; 1855 polarity ^= 1; 1856 b = other[j++]; 1857 polarity ^= 2; 1858 } 1859 break; 1860 } 1861 } 1862 loop_end: 1863 buffer[k++] = UNICODESET_HIGH; // terminate 1864 len = k; 1865 swapBuffers(); 1866 releasePattern(); 1867 } 1868 1869 // polarity = 0 is normal: x intersect y 1870 // polarity = 2: x intersect ~y == set-minus 1871 // polarity = 1: ~x intersect y 1872 // polarity = 3: ~x intersect ~y 1873 1874 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { 1875 if (isFrozen() || isBogus()) { 1876 return; 1877 } 1878 if (!ensureBufferCapacity(len + otherLen)) { 1879 return; 1880 } 1881 1882 int32_t i = 0, j = 0, k = 0; 1883 UChar32 a = list[i++]; 1884 UChar32 b = other[j++]; 1885 // change from xor is that we have to check overlapping pairs 1886 // polarity bit 1 means a is second, bit 2 means b is. 1887 for (;;) { 1888 switch (polarity) { 1889 case 0: // both first; drop the smaller 1890 if (a < b) { // drop a 1891 a = list[i++]; 1892 polarity ^= 1; 1893 } else if (b < a) { // drop b 1894 b = other[j++]; 1895 polarity ^= 2; 1896 } else { // a == b, take one, drop other 1897 if (a == UNICODESET_HIGH) goto loop_end; 1898 buffer[k++] = a; 1899 a = list[i++]; 1900 polarity ^= 1; 1901 b = other[j++]; 1902 polarity ^= 2; 1903 } 1904 break; 1905 case 3: // both second; take lower if unequal 1906 if (a < b) { // take a 1907 buffer[k++] = a; 1908 a = list[i++]; 1909 polarity ^= 1; 1910 } else if (b < a) { // take b 1911 buffer[k++] = b; 1912 b = other[j++]; 1913 polarity ^= 2; 1914 } else { // a == b, take one, drop other 1915 if (a == UNICODESET_HIGH) goto loop_end; 1916 buffer[k++] = a; 1917 a = list[i++]; 1918 polarity ^= 1; 1919 b = other[j++]; 1920 polarity ^= 2; 1921 } 1922 break; 1923 case 1: // a second, b first; 1924 if (a < b) { // NO OVERLAP, drop a 1925 a = list[i++]; 1926 polarity ^= 1; 1927 } else if (b < a) { // OVERLAP, take b 1928 buffer[k++] = b; 1929 b = other[j++]; 1930 polarity ^= 2; 1931 } else { // a == b, drop both! 1932 if (a == UNICODESET_HIGH) goto loop_end; 1933 a = list[i++]; 1934 polarity ^= 1; 1935 b = other[j++]; 1936 polarity ^= 2; 1937 } 1938 break; 1939 case 2: // a first, b second; if a < b, overlap 1940 if (b < a) { // no overlap, drop b 1941 b = other[j++]; 1942 polarity ^= 2; 1943 } else if (a < b) { // OVERLAP, take a 1944 buffer[k++] = a; 1945 a = list[i++]; 1946 polarity ^= 1; 1947 } else { // a == b, drop both! 1948 if (a == UNICODESET_HIGH) goto loop_end; 1949 a = list[i++]; 1950 polarity ^= 1; 1951 b = other[j++]; 1952 polarity ^= 2; 1953 } 1954 break; 1955 } 1956 } 1957 loop_end: 1958 buffer[k++] = UNICODESET_HIGH; // terminate 1959 len = k; 1960 swapBuffers(); 1961 releasePattern(); 1962 } 1963 1964 /** 1965 * Append the <code>toPattern()</code> representation of a 1966 * string to the given <code>StringBuffer</code>. 1967 */ 1968 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) { 1969 UChar32 cp; 1970 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1971 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); 1972 } 1973 } 1974 1975 /** 1976 * Append the <code>toPattern()</code> representation of a 1977 * character to the given <code>StringBuffer</code>. 1978 */ 1979 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) { 1980 if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { 1981 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything 1982 // unprintable 1983 ICU_Utility::escape(buf, c); 1984 return; 1985 } 1986 // Okay to let ':' pass through 1987 switch (c) { 1988 case u'[': 1989 case u']': 1990 case u'-': 1991 case u'^': 1992 case u'&': 1993 case u'\\': 1994 case u'{': 1995 case u'}': 1996 case u':': 1997 case SymbolTable::SYMBOL_REF: 1998 buf.append(u'\\'); 1999 break; 2000 default: 2001 // Escape whitespace 2002 if (PatternProps::isWhiteSpace(c)) { 2003 buf.append(u'\\'); 2004 } 2005 break; 2006 } 2007 buf.append(c); 2008 } 2009 2010 void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end, 2011 UBool escapeUnprintable) { 2012 _appendToPat(result, start, escapeUnprintable); 2013 if (start != end) { 2014 if ((start+1) != end || 2015 // Avoid writing what looks like a lead+trail surrogate pair. 2016 start == 0xdbff) { 2017 result.append(u'-'); 2018 } 2019 _appendToPat(result, end, escapeUnprintable); 2020 } 2021 } 2022 2023 /** 2024 * Append a string representation of this set to result. This will be 2025 * a cleaned version of the string passed to applyPattern(), if there 2026 * is one. Otherwise it will be generated. 2027 */ 2028 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, 2029 UBool escapeUnprintable) const 2030 { 2031 if (pat != nullptr) { 2032 int32_t i; 2033 int32_t backslashCount = 0; 2034 for (i=0; i<patLen; ) { 2035 UChar32 c; 2036 U16_NEXT(pat, i, patLen, c); 2037 if (escapeUnprintable ? 2038 ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { 2039 // If the unprintable character is preceded by an odd 2040 // number of backslashes, then it has been escaped. 2041 // Before unescaping it, we delete the final 2042 // backslash. 2043 if ((backslashCount % 2) == 1) { 2044 result.truncate(result.length() - 1); 2045 } 2046 ICU_Utility::escape(result, c); 2047 backslashCount = 0; 2048 } else { 2049 result.append(c); 2050 if (c == u'\\') { 2051 ++backslashCount; 2052 } else { 2053 backslashCount = 0; 2054 } 2055 } 2056 } 2057 return result; 2058 } 2059 2060 return _generatePattern(result, escapeUnprintable); 2061 } 2062 2063 /** 2064 * Returns a string representation of this set. If the result of 2065 * calling this function is passed to a UnicodeSet constructor, it 2066 * will produce another set that is equal to this one. 2067 */ 2068 UnicodeString& UnicodeSet::toPattern(UnicodeString& result, 2069 UBool escapeUnprintable) const 2070 { 2071 result.truncate(0); 2072 return _toPattern(result, escapeUnprintable); 2073 } 2074 2075 /** 2076 * Generate and append a string representation of this set to result. 2077 * This does not use this.pat, the cleaned up copy of the string 2078 * passed to applyPattern(). 2079 */ 2080 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, 2081 UBool escapeUnprintable) const 2082 { 2083 result.append(u'['); 2084 2085 int32_t i = 0; 2086 int32_t limit = len & ~1; // = 2 * getRangeCount() 2087 2088 // If the set contains at least 2 intervals and includes both 2089 // MIN_VALUE and MAX_VALUE, then the inverse representation will 2090 // be more economical. 2091 // if (getRangeCount() >= 2 && 2092 // getRangeStart(0) == MIN_VALUE && 2093 // getRangeEnd(last) == MAX_VALUE) 2094 // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 2095 // If limit == len then len is even and the last range ends with MAX_VALUE. 2096 // 2097 // *But* do not write the inverse (complement) if there are strings. 2098 // Since ICU 70, the '^' performs a code point complement which removes all strings. 2099 if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) { 2100 // Emit the inverse 2101 result.append(u'^'); 2102 // Offsetting the inversion list index by one lets us 2103 // iterate over the ranges of the set complement. 2104 i = 1; 2105 --limit; 2106 } 2107 2108 // Emit the ranges as pairs. 2109 while (i < limit) { 2110 UChar32 start = list[i]; // getRangeStart() 2111 UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one 2112 if (!(0xd800 <= end && end <= 0xdbff)) { 2113 _appendToPat(result, start, end, escapeUnprintable); 2114 i += 2; 2115 } else { 2116 // The range ends with a lead surrogate. 2117 // Avoid writing what looks like a lead+trail surrogate pair. 2118 // 1. Postpone ranges that start with a lead surrogate code point. 2119 int32_t firstLead = i; 2120 while ((i += 2) < limit && list[i] <= 0xdbff) {} 2121 int32_t firstAfterLead = i; 2122 // 2. Write following ranges that start with a trail surrogate code point. 2123 while (i < limit && (start = list[i]) <= 0xdfff) { 2124 _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable); 2125 i += 2; 2126 } 2127 // 3. Now write the postponed ranges. 2128 for (int j = firstLead; j < firstAfterLead; j += 2) { 2129 _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable); 2130 } 2131 } 2132 } 2133 2134 if (strings_ != nullptr) { 2135 for (int32_t i = 0; i<strings_->size(); ++i) { 2136 result.append(u'{'); 2137 _appendToPat(result, 2138 *static_cast<const UnicodeString*>(strings_->elementAt(i)), 2139 escapeUnprintable); 2140 result.append(u'}'); 2141 } 2142 } 2143 return result.append(u']'); 2144 } 2145 2146 /** 2147 * Release existing cached pattern 2148 */ 2149 void UnicodeSet::releasePattern() { 2150 if (pat) { 2151 uprv_free(pat); 2152 pat = nullptr; 2153 patLen = 0; 2154 } 2155 } 2156 2157 /** 2158 * Set the new pattern to cache. 2159 */ 2160 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { 2161 releasePattern(); 2162 pat = static_cast<char16_t*>(uprv_malloc((newPatLen + 1) * sizeof(char16_t))); 2163 if (pat) { 2164 patLen = newPatLen; 2165 u_memcpy(pat, newPat, patLen); 2166 pat[patLen] = 0; 2167 } 2168 // else we don't care if malloc failed. This was just a nice cache. 2169 // We can regenerate an equivalent pattern later when requested. 2170 } 2171 2172 UnicodeSet *UnicodeSet::freeze() { 2173 if(!isFrozen() && !isBogus()) { 2174 compact(); 2175 2176 // Optimize contains() and span() and similar functions. 2177 if (hasStrings()) { 2178 stringSpan = new UnicodeSetStringSpan(*this, *strings_, UnicodeSetStringSpan::ALL); 2179 if (stringSpan == nullptr) { 2180 setToBogus(); 2181 return this; 2182 } else if (!stringSpan->needsStringSpanUTF16()) { 2183 // All strings are irrelevant for span() etc. because 2184 // all of each string's code points are contained in this set. 2185 // Do not check needsStringSpanUTF8() because UTF-8 has at most as 2186 // many relevant strings as UTF-16. 2187 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) 2188 delete stringSpan; 2189 stringSpan = nullptr; 2190 } 2191 } 2192 if (stringSpan == nullptr) { 2193 // No span-relevant strings: Optimize for code point spans. 2194 bmpSet=new BMPSet(list, len); 2195 if (bmpSet == nullptr) { // Check for memory allocation error. 2196 setToBogus(); 2197 } 2198 } 2199 } 2200 return this; 2201 } 2202 2203 int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { 2204 if(length>0 && bmpSet!=nullptr) { 2205 return static_cast<int32_t>(bmpSet->span(s, s + length, spanCondition) - s); 2206 } 2207 if(length<0) { 2208 length=u_strlen(s); 2209 } 2210 if(length==0) { 2211 return 0; 2212 } 2213 if(stringSpan!=nullptr) { 2214 return stringSpan->span(s, length, spanCondition); 2215 } else if(hasStrings()) { 2216 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2217 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : 2218 UnicodeSetStringSpan::FWD_UTF16_CONTAINED; 2219 UnicodeSetStringSpan strSpan(*this, *strings_, which); 2220 if(strSpan.needsStringSpanUTF16()) { 2221 return strSpan.span(s, length, spanCondition); 2222 } 2223 } 2224 2225 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2226 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2227 } 2228 2229 UChar32 c; 2230 int32_t start=0, prev=0; 2231 do { 2232 U16_NEXT(s, start, length, c); 2233 if(spanCondition!=contains(c)) { 2234 break; 2235 } 2236 } while((prev=start)<length); 2237 return prev; 2238 } 2239 2240 int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { 2241 if(length>0 && bmpSet!=nullptr) { 2242 return static_cast<int32_t>(bmpSet->spanBack(s, s + length, spanCondition) - s); 2243 } 2244 if(length<0) { 2245 length=u_strlen(s); 2246 } 2247 if(length==0) { 2248 return 0; 2249 } 2250 if(stringSpan!=nullptr) { 2251 return stringSpan->spanBack(s, length, spanCondition); 2252 } else if(hasStrings()) { 2253 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2254 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : 2255 UnicodeSetStringSpan::BACK_UTF16_CONTAINED; 2256 UnicodeSetStringSpan strSpan(*this, *strings_, which); 2257 if(strSpan.needsStringSpanUTF16()) { 2258 return strSpan.spanBack(s, length, spanCondition); 2259 } 2260 } 2261 2262 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2263 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2264 } 2265 2266 UChar32 c; 2267 int32_t prev=length; 2268 do { 2269 U16_PREV(s, 0, length, c); 2270 if(spanCondition!=contains(c)) { 2271 break; 2272 } 2273 } while((prev=length)>0); 2274 return prev; 2275 } 2276 2277 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2278 if(length>0 && bmpSet!=nullptr) { 2279 const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s); 2280 return static_cast<int32_t>(bmpSet->spanUTF8(s0, length, spanCondition) - s0); 2281 } 2282 if(length<0) { 2283 length = static_cast<int32_t>(uprv_strlen(s)); 2284 } 2285 if(length==0) { 2286 return 0; 2287 } 2288 if(stringSpan!=nullptr) { 2289 return stringSpan->spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition); 2290 } else if(hasStrings()) { 2291 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2292 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : 2293 UnicodeSetStringSpan::FWD_UTF8_CONTAINED; 2294 UnicodeSetStringSpan strSpan(*this, *strings_, which); 2295 if(strSpan.needsStringSpanUTF8()) { 2296 return strSpan.spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition); 2297 } 2298 } 2299 2300 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2301 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2302 } 2303 2304 UChar32 c; 2305 int32_t start=0, prev=0; 2306 do { 2307 U8_NEXT_OR_FFFD(s, start, length, c); 2308 if(spanCondition!=contains(c)) { 2309 break; 2310 } 2311 } while((prev=start)<length); 2312 return prev; 2313 } 2314 2315 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2316 if(length>0 && bmpSet!=nullptr) { 2317 const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s); 2318 return bmpSet->spanBackUTF8(s0, length, spanCondition); 2319 } 2320 if(length<0) { 2321 length = static_cast<int32_t>(uprv_strlen(s)); 2322 } 2323 if(length==0) { 2324 return 0; 2325 } 2326 if(stringSpan!=nullptr) { 2327 return stringSpan->spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition); 2328 } else if(hasStrings()) { 2329 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2330 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : 2331 UnicodeSetStringSpan::BACK_UTF8_CONTAINED; 2332 UnicodeSetStringSpan strSpan(*this, *strings_, which); 2333 if(strSpan.needsStringSpanUTF8()) { 2334 return strSpan.spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition); 2335 } 2336 } 2337 2338 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2339 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2340 } 2341 2342 UChar32 c; 2343 int32_t prev=length; 2344 do { 2345 U8_PREV_OR_FFFD(s, 0, length, c); 2346 if(spanCondition!=contains(c)) { 2347 break; 2348 } 2349 } while((prev=length)>0); 2350 return prev; 2351 } 2352 2353 U_NAMESPACE_END