uniset.h (71860B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 1999-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * Date Name Description 9 * 10/20/99 alan Creation. 10 *************************************************************************** 11 */ 12 13 #ifndef UNICODESET_H 14 #define UNICODESET_H 15 16 #include "unicode/utypes.h" 17 18 #if U_SHOW_CPLUSPLUS_API 19 20 #include "unicode/ucpmap.h" 21 #include "unicode/unifilt.h" 22 #include "unicode/unistr.h" 23 #include "unicode/uset.h" 24 25 /** 26 * \file 27 * \brief C++ API: Unicode Set 28 */ 29 30 U_NAMESPACE_BEGIN 31 32 // Forward Declarations. 33 class BMPSet; 34 class ParsePosition; 35 class RBBIRuleScanner; 36 class SymbolTable; 37 class UnicodeSetStringSpan; 38 class UVector; 39 class RuleCharacterIterator; 40 41 /** 42 * A mutable set of Unicode characters and multicharacter strings. Objects of this class 43 * represent <em>character classes</em> used in regular expressions. 44 * A character class specifies a subset of Unicode code points. Legal 45 * code points are U+0000 to U+10FFFF, inclusive. 46 * 47 * <p>The UnicodeSet class is not designed to be subclassed. 48 * 49 * <p><code>UnicodeSet</code> supports two APIs. The first is the 50 * <em>operand</em> API that allows the caller to modify the value of 51 * a <code>UnicodeSet</code> object. It conforms to Java 2's 52 * <code>java.util.Set</code> interface, although 53 * <code>UnicodeSet</code> does not actually implement that 54 * interface. All methods of <code>Set</code> are supported, with the 55 * modification that they take a character range or single character 56 * instead of an <code>Object</code>, and they take a 57 * <code>UnicodeSet</code> instead of a <code>Collection</code>. 
The 58 * operand API may be thought of in terms of boolean logic: a boolean 59 * OR is implemented by <code>add</code>, a boolean AND is implemented 60 * by <code>retain</code>, a boolean XOR is implemented by 61 * <code>complement</code> taking an argument, and a boolean NOT is 62 * implemented by <code>complement</code> with no argument. In terms 63 * of traditional set theory function names, <code>add</code> is a 64 * union, <code>retain</code> is an intersection, <code>remove</code> 65 * is an asymmetric difference, and <code>complement</code> with no 66 * argument is a set complement with respect to the superset range 67 * <code>MIN_VALUE-MAX_VALUE</code> 68 * 69 * <p>The second API is the 70 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 71 * <code>java.text.Format</code>-derived classes. Unlike the 72 * methods that add characters, add categories, and control the logic 73 * of the set, the method <code>applyPattern()</code> sets all 74 * attributes of a <code>UnicodeSet</code> at once, based on a 75 * string pattern. 76 * 77 * <p><b>Pattern syntax</b></p> 78 * 79 * Patterns are accepted by the constructors and the 80 * <code>applyPattern()</code> methods and returned by the 81 * <code>toPattern()</code> method. These patterns follow a syntax 82 * similar to that employed by version 8 regular expression character 83 * classes. 
Here are some simple examples: 84 * 85 * \htmlonly<blockquote>\endhtmlonly 86 * <table> 87 * <tr align="top"> 88 * <td nowrap valign="top" align="left"><code>[]</code></td> 89 * <td valign="top">No characters</td> 90 * </tr><tr align="top"> 91 * <td nowrap valign="top" align="left"><code>[a]</code></td> 92 * <td valign="top">The character 'a'</td> 93 * </tr><tr align="top"> 94 * <td nowrap valign="top" align="left"><code>[ae]</code></td> 95 * <td valign="top">The characters 'a' and 'e'</td> 96 * </tr> 97 * <tr> 98 * <td nowrap valign="top" align="left"><code>[a-e]</code></td> 99 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code 100 * point order</td> 101 * </tr> 102 * <tr> 103 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td> 104 * <td valign="top">The character U+4E01</td> 105 * </tr> 106 * <tr> 107 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td> 108 * <td valign="top">The character 'a' and the multicharacter strings "ab" and 109 * "ac"</td> 110 * </tr> 111 * <tr> 112 * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td> 113 * <td valign="top">All characters in the general category Uppercase Letter</td> 114 * </tr> 115 * </table> 116 * \htmlonly</blockquote>\endhtmlonly 117 * 118 * Any character may be preceded by a backslash in order to remove any special 119 * meaning. White space characters, as defined by UCharacter.isWhitespace(), are 120 * ignored, unless they are escaped. 121 * 122 * <p>Property patterns specify a set of characters having a certain 123 * property as defined by the Unicode standard. Both the POSIX-like 124 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a 125 * complete list of supported property patterns, see the User's Guide 126 * for UnicodeSet at 127 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset"> 128 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>. 
129 * Actual determination of property data is defined by the underlying 130 * Unicode database as implemented by UCharacter. 131 * 132 * <p>Patterns specify individual characters, ranges of characters, and 133 * Unicode property sets. When elements are concatenated, they 134 * specify their union. To complement a set, place a '^' immediately 135 * after the opening '['. Property patterns are inverted by modifying 136 * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location, 137 * '^' has no special meaning. 138 * 139 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]" 140 * perform a “code point complement” (all code points minus the original set), 141 * removing all multicharacter strings, 142 * equivalent to <code>.complement().removeAllStrings()</code>. 143 * The complement() API function continues to perform a 144 * symmetric difference with all code points and thus retains all multicharacter strings. 145 * 146 * <p>Ranges are indicated by placing a '-' between two 147 * characters, as in "a-z". This specifies the range of all 148 * characters from the left to the right, in Unicode order. If the 149 * left character is greater than or equal to the 150 * right character it is a syntax error. If a '-' occurs as the first 151 * character after the opening '[' or '[^', or if it occurs as the 152 * last character before the closing ']', then it is taken as a 153 * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same 154 * set of three characters, 'a', 'b', and '-'. 155 * 156 * <p>Sets may be intersected using the '&' operator or the asymmetric 157 * set difference may be taken using the '-' operator, for example, 158 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 159 * with values less than 4096. Operators ('&' and '-') have equal 160 * precedence and bind left-to-right. Thus 161 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 162 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". 
This only really matters for 163 * difference; intersection is commutative. 164 * 165 * <table> 166 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a' 167 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a' 168 * through 'z' and all letters in between, in Unicode order 169 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing 170 * all characters but 'a' through 'z', 171 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 172 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 173 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 174 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 175 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 176 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 177 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 178 * <em>pat2</em> 179 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code> 180 * <td>The set of characters having the specified 181 * Unicode property; in 182 * this case, Unicode uppercase letters 183 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code> 184 * <td>The set of characters <em>not</em> having the given 185 * Unicode property 186 * </table> 187 * 188 * <p><b>Formal syntax</b></p> 189 * 190 * \htmlonly<blockquote>\endhtmlonly 191 * <table> 192 * <tr align="top"> 193 * <td nowrap valign="top" align="right"><code>pattern := </code></td> 194 * <td valign="top"><code>('[' '^'? 
item* ']') | 195 * property</code></td> 196 * </tr> 197 * <tr align="top"> 198 * <td nowrap valign="top" align="right"><code>item := </code></td> 199 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br> 200 * </code></td> 201 * </tr> 202 * <tr align="top"> 203 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td> 204 * <td valign="top"><code>pattern | pattern-expr pattern | 205 * pattern-expr op pattern<br> 206 * </code></td> 207 * </tr> 208 * <tr align="top"> 209 * <td nowrap valign="top" align="right"><code>op := </code></td> 210 * <td valign="top"><code>'&' | '-'<br> 211 * </code></td> 212 * </tr> 213 * <tr align="top"> 214 * <td nowrap valign="top" align="right"><code>special := </code></td> 215 * <td valign="top"><code>'[' | ']' | '-'<br> 216 * </code></td> 217 * </tr> 218 * <tr align="top"> 219 * <td nowrap valign="top" align="right"><code>char := </code></td> 220 * <td valign="top"><em>any character that is not</em><code> special<br> 221 * | ('\' </code><em>any character</em><code>)<br> 222 * | ('\\u' hex hex hex hex)<br> 223 * </code></td> 224 * </tr> 225 * <tr align="top"> 226 * <td nowrap valign="top" align="right"><code>hex := </code></td> 227 * <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> 228 * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> 229 * </tr> 230 * <tr> 231 * <td nowrap valign="top" align="right"><code>property := </code></td> 232 * <td valign="top"><em>a Unicode property set pattern</em></td> 233 * </tr> 234 * </table> 235 * <br> 236 * <table border="1"> 237 * <tr> 238 * <td>Legend: <table> 239 * <tr> 240 * <td nowrap valign="top"><code>a := b</code></td> 241 * <td width="20" valign="top"> </td> 242 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td> 243 * </tr> 244 * <tr> 245 * <td nowrap valign="top"><code>a?</code></td> 246 * <td valign="top"></td> 247 * <td valign="top">zero or one instance of 
<code>a</code><br> 248 * </td> 249 * </tr> 250 * <tr> 251 * <td nowrap valign="top"><code>a*</code></td> 252 * <td valign="top"></td> 253 * <td valign="top">zero or more instances of <code>a</code><br> 254 * </td> 255 * </tr> 256 * <tr> 257 * <td nowrap valign="top"><code>a | b</code></td> 258 * <td valign="top"></td> 259 * <td valign="top">either <code>a</code> or <code>b</code><br> 260 * </td> 261 * </tr> 262 * <tr> 263 * <td nowrap valign="top"><code>'a'</code></td> 264 * <td valign="top"></td> 265 * <td valign="top">the literal string between the quotes </td> 266 * </tr> 267 * </table> 268 * </td> 269 * </tr> 270 * </table> 271 * \htmlonly</blockquote>\endhtmlonly 272 * 273 * <p>Note: 274 * - Most UnicodeSet methods do not take a UErrorCode parameter because 275 * there are usually very few opportunities for failure other than a shortage 276 * of memory, error codes in low-level C++ string methods would be inconvenient, 277 * and the error code as the last parameter (ICU convention) would prevent 278 * the use of default parameter values. 279 * Instead, such methods set the UnicodeSet into a "bogus" state 280 * (see isBogus()) if an error occurs. 281 * 282 * @author Alan Liu 283 * @stable ICU 2.0 284 */ 285 class U_COMMON_API UnicodeSet final : public UnicodeFilter { 286 private: 287 /** 288 * Enough for sets with few ranges. 289 * For example, White_Space has 10 ranges, list length 21. 290 */ 291 static constexpr int32_t INITIAL_CAPACITY = 25; 292 // fFlags constant 293 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) 294 295 UChar32* list = stackList; // MUST be terminated with HIGH 296 int32_t capacity = INITIAL_CAPACITY; // capacity of list 297 int32_t len = 1; // length of list used; 1 <= len <= capacity 298 uint8_t fFlags = 0; // Bit flag (see constants above) 299 300 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr. 
301 UChar32* buffer = nullptr; // internal buffer, may be nullptr 302 int32_t bufferCapacity = 0; // capacity of buffer 303 304 /** 305 * The pattern representation of this set. This may not be the 306 * most economical pattern. It is the pattern supplied to 307 * applyPattern(), with variables substituted and whitespace 308 * removed. For sets constructed without applyPattern(), or 309 * modified using the non-pattern API, this string will be empty, 310 * indicating that toPattern() must generate a pattern 311 * representation from the inversion list. 312 */ 313 char16_t *pat = nullptr; 314 int32_t patLen = 0; 315 316 UVector* strings_ = nullptr; // maintained in sorted order 317 UnicodeSetStringSpan *stringSpan = nullptr; 318 319 /** 320 * Initial list array. 321 * Avoids some heap allocations, and list is never nullptr. 322 * Increases the object size a bit. 323 */ 324 UChar32 stackList[INITIAL_CAPACITY]; 325 326 public: 327 /** 328 * Determine if this object contains a valid set. 329 * A bogus set has no value. It is different from an empty set. 330 * It can be used to indicate that no set value is available. 331 * 332 * @return true if the set is bogus/invalid, false otherwise 333 * @see setToBogus() 334 * @stable ICU 4.0 335 */ 336 inline UBool isBogus() const; 337 338 /** 339 * Make this UnicodeSet object invalid. 340 * The set will test true with isBogus(). 341 * 342 * A bogus set has no value. It is different from an empty set. 343 * It can be used to indicate that no set value is available. 344 * 345 * This utility function is used throughout the UnicodeSet 346 * implementation to indicate that a UnicodeSet operation failed, 347 * and may be used in other functions, 348 * especially but not exclusively when such functions do not 349 * take a UErrorCode for simplicity. 350 * 351 * @see isBogus() 352 * @stable ICU 4.0 353 */ 354 void setToBogus(); 355 356 public: 357 358 enum { 359 /** 360 * Minimum value that can be stored in a UnicodeSet. 
361 * @stable ICU 2.4 362 */ 363 MIN_VALUE = 0, 364 365 /** 366 * Maximum value that can be stored in a UnicodeSet. 367 * @stable ICU 2.4 368 */ 369 MAX_VALUE = 0x10ffff 370 }; 371 372 //---------------------------------------------------------------- 373 // Constructors &c 374 //---------------------------------------------------------------- 375 376 public: 377 378 /** 379 * Constructs an empty set. 380 * @stable ICU 2.0 381 */ 382 UnicodeSet(); 383 384 /** 385 * Constructs a set containing the given range. If <code>end < 386 * start</code> then an empty set is created. 387 * 388 * @param start first character, inclusive, of range 389 * @param end last character, inclusive, of range 390 * @stable ICU 2.4 391 */ 392 UnicodeSet(UChar32 start, UChar32 end); 393 394 #ifndef U_HIDE_INTERNAL_API 395 /** 396 * @internal 397 */ 398 enum ESerialization { 399 kSerialized /* result of serialize() */ 400 }; 401 402 /** 403 * Constructs a set from the output of serialize(). 404 * 405 * @param buffer the 16 bit array 406 * @param bufferLen the original length returned from serialize() 407 * @param serialization the value 'kSerialized' 408 * @param status error code 409 * 410 * @internal 411 */ 412 UnicodeSet(const uint16_t buffer[], int32_t bufferLen, 413 ESerialization serialization, UErrorCode &status); 414 #endif /* U_HIDE_INTERNAL_API */ 415 416 /** 417 * Constructs a set from the given pattern. See the class 418 * description for the syntax of the pattern language. 419 * @param pattern a string specifying what characters are in the set 420 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 421 * contains a syntax error. 422 * @stable ICU 2.0 423 */ 424 UnicodeSet(const UnicodeString& pattern, 425 UErrorCode& status); 426 427 #ifndef U_HIDE_INTERNAL_API 428 /** 429 * Constructs a set from the given pattern. See the class 430 * description for the syntax of the pattern language. 
431 * @param pattern a string specifying what characters are in the set 432 * @param options bitmask for options to apply to the pattern. 433 * Valid options are USET_IGNORE_SPACE and 434 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 435 * These case options are mutually exclusive. 436 * @param symbols a symbol table mapping variable names to values 437 * and stand-in characters to UnicodeSets; may be nullptr 438 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 439 * contains a syntax error. 440 * @internal 441 */ 442 UnicodeSet(const UnicodeString& pattern, 443 uint32_t options, 444 const SymbolTable* symbols, 445 UErrorCode& status); 446 #endif /* U_HIDE_INTERNAL_API */ 447 448 /** 449 * Constructs a set from the given pattern. See the class description 450 * for the syntax of the pattern language. 451 * @param pattern a string specifying what characters are in the set 452 * @param pos on input, the position in pattern at which to start parsing. 453 * On output, the position after the last character parsed. 454 * @param options bitmask for options to apply to the pattern. 455 * Valid options are USET_IGNORE_SPACE and 456 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 457 * These case options are mutually exclusive. 458 * @param symbols a symbol table mapping variable names to values 459 * and stand-in characters to UnicodeSets; may be nullptr 460 * @param status input-output error code 461 * @stable ICU 2.8 462 */ 463 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 464 uint32_t options, 465 const SymbolTable* symbols, 466 UErrorCode& status); 467 468 /** 469 * Constructs a set that is identical to the given UnicodeSet. 470 * @stable ICU 2.0 471 */ 472 UnicodeSet(const UnicodeSet& o); 473 474 /** 475 * Destructs the set. 
476 * @stable ICU 2.0 477 */ 478 virtual ~UnicodeSet(); 479 480 /** 481 * Assigns this object to be a copy of another. 482 * A frozen set will not be modified. 483 * @stable ICU 2.0 484 */ 485 UnicodeSet& operator=(const UnicodeSet& o); 486 487 /** 488 * Compares the specified object with this set for equality. Returns 489 * <tt>true</tt> if the two sets 490 * have the same size, and every member of the specified set is 491 * contained in this set (or equivalently, every member of this set is 492 * contained in the specified set). 493 * 494 * @param o set to be compared for equality with this set. 495 * @return <tt>true</tt> if the specified set is equal to this set. 496 * @stable ICU 2.0 497 */ 498 bool operator==(const UnicodeSet& o) const; 499 500 /** 501 * Compares the specified object with this set for equality. Returns 502 * <tt>true</tt> if the specified set is not equal to this set. 503 * @stable ICU 2.0 504 */ 505 inline bool operator!=(const UnicodeSet& o) const; 506 507 /** 508 * Returns a copy of this object. All UnicodeFunctor objects have 509 * to support cloning in order to allow classes using 510 * UnicodeFunctors, such as Transliterator, to implement cloning. 511 * If this set is frozen, then the clone will be frozen as well. 512 * Use cloneAsThawed() for a mutable clone of a frozen set. 513 * @see cloneAsThawed 514 * @stable ICU 2.0 515 */ 516 virtual UnicodeSet* clone() const override; 517 518 /** 519 * Returns the hash code value for this set. 520 * 521 * @return the hash code value for this set. 522 * @see Object#hashCode() 523 * @stable ICU 2.0 524 */ 525 int32_t hashCode() const; 526 527 /** 528 * Get a UnicodeSet pointer from a USet 529 * 530 * @param uset a USet (the ICU plain C type for UnicodeSet) 531 * @return the corresponding UnicodeSet pointer. 
532 * 533 * @stable ICU 4.2 534 */ 535 inline static UnicodeSet *fromUSet(USet *uset); 536 537 /** 538 * Get a UnicodeSet pointer from a const USet 539 * 540 * @param uset a const USet (the ICU plain C type for UnicodeSet) 541 * @return the corresponding UnicodeSet pointer. 542 * 543 * @stable ICU 4.2 544 */ 545 inline static const UnicodeSet *fromUSet(const USet *uset); 546 547 /** 548 * Produce a USet * pointer for this UnicodeSet. 549 * USet is the plain C type for UnicodeSet 550 * 551 * @return a USet pointer for this UnicodeSet 552 * @stable ICU 4.2 553 */ 554 inline USet *toUSet(); 555 556 557 /** 558 * Produce a const USet * pointer for this UnicodeSet. 559 * USet is the plain C type for UnicodeSet 560 * 561 * @return a const USet pointer for this UnicodeSet 562 * @stable ICU 4.2 563 */ 564 inline const USet * toUSet() const; 565 566 567 //---------------------------------------------------------------- 568 // Freezable API 569 //---------------------------------------------------------------- 570 571 /** 572 * Determines whether the set has been frozen (made immutable) or not. 573 * See the ICU4J Freezable interface for details. 574 * @return true/false for whether the set has been frozen 575 * @see freeze 576 * @see cloneAsThawed 577 * @stable ICU 3.8 578 */ 579 inline UBool isFrozen() const; 580 581 /** 582 * Freeze the set (make it immutable). 583 * Once frozen, it cannot be unfrozen and is therefore thread-safe 584 * until it is deleted. 585 * See the ICU4J Freezable interface for details. 586 * Freezing the set may also make some operations faster, for example 587 * contains() and span(). 588 * A frozen set will not be modified. (It remains frozen.) 589 * @return this set. 590 * @see isFrozen 591 * @see cloneAsThawed 592 * @stable ICU 3.8 593 */ 594 UnicodeSet *freeze(); 595 596 /** 597 * Clone the set and make the clone mutable. 598 * See the ICU4J Freezable interface for details. 
599 * @return the mutable clone 600 * @see freeze 601 * @see isFrozen 602 * @stable ICU 3.8 603 */ 604 UnicodeSet *cloneAsThawed() const; 605 606 //---------------------------------------------------------------- 607 // Public API 608 //---------------------------------------------------------------- 609 610 /** 611 * Make this object represent the range `start - end`. 612 * If `start > end` then this object is set to an empty range. 613 * A frozen set will not be modified. 614 * 615 * @param start first character in the set, inclusive 616 * @param end last character in the set, inclusive 617 * @stable ICU 2.4 618 */ 619 UnicodeSet& set(UChar32 start, UChar32 end); 620 621 /** 622 * Return true if the given position, in the given pattern, appears 623 * to be the start of a UnicodeSet pattern. 624 * @stable ICU 2.4 625 */ 626 static UBool resemblesPattern(const UnicodeString& pattern, 627 int32_t pos); 628 629 /** 630 * Modifies this set to represent the set specified by the given 631 * pattern, ignoring Unicode Pattern_White_Space characters. 632 * See the class description for the syntax of the pattern language. 633 * A frozen set will not be modified. 634 * @param pattern a string specifying what characters are in the set 635 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 636 * contains a syntax error. 637 * <em> Empties the set passed before applying the pattern.</em> 638 * @return a reference to this 639 * @stable ICU 2.0 640 */ 641 UnicodeSet& applyPattern(const UnicodeString& pattern, 642 UErrorCode& status); 643 644 #ifndef U_HIDE_INTERNAL_API 645 /** 646 * Modifies this set to represent the set specified by the given 647 * pattern, optionally ignoring Unicode Pattern_White_Space characters. 648 * See the class description for the syntax of the pattern language. 649 * A frozen set will not be modified. 
650 * @param pattern a string specifying what characters are in the set 651 * @param options bitmask for options to apply to the pattern. 652 * Valid options are USET_IGNORE_SPACE and 653 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 654 * These case options are mutually exclusive. 655 * @param symbols a symbol table mapping variable names to 656 * values and stand-ins to UnicodeSets; may be nullptr 657 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 658 * contains a syntax error. 659 *<em> Empties the set passed before applying the pattern.</em> 660 * @return a reference to this 661 * @internal 662 */ 663 UnicodeSet& applyPattern(const UnicodeString& pattern, 664 uint32_t options, 665 const SymbolTable* symbols, 666 UErrorCode& status); 667 #endif /* U_HIDE_INTERNAL_API */ 668 669 /** 670 * Parses the given pattern, starting at the given position. The 671 * character at pattern.charAt(pos.getIndex()) must be '[', or the 672 * parse fails. Parsing continues until the corresponding closing 673 * ']'. If a syntax error is encountered between the opening and 674 * closing bracket, the parse fails. Upon return from a successful 675 * parse, the ParsePosition is updated to point to the character 676 * following the closing ']', and a StringBuffer containing a 677 * pairs list for the parsed pattern is returned. This method calls 678 * itself recursively to parse embedded subpatterns. 679 *<em> Empties the set passed before applying the pattern.</em> 680 * A frozen set will not be modified. 681 * 682 * @param pattern the string containing the pattern to be parsed. 683 * The portion of the string from pos.getIndex(), which must be a 684 * '[', to the corresponding closing ']', is parsed. 685 * @param pos upon entry, the position at which to begin parsing. 686 * The character at pattern.charAt(pos.getIndex()) must be a '['. 
687 * Upon return from a successful parse, pos.getIndex() is either 688 * the character after the closing ']' of the parsed pattern, or 689 * pattern.length() if the closing ']' is the last character of 690 * the pattern string. 691 * @param options bitmask for options to apply to the pattern. 692 * Valid options are USET_IGNORE_SPACE and 693 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 694 * These case options are mutually exclusive. 695 * @param symbols a symbol table mapping variable names to 696 * values and stand-ins to UnicodeSets; may be nullptr 697 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 698 * contains a syntax error. 699 * @return a reference to this 700 * @stable ICU 2.8 701 */ 702 UnicodeSet& applyPattern(const UnicodeString& pattern, 703 ParsePosition& pos, 704 uint32_t options, 705 const SymbolTable* symbols, 706 UErrorCode& status); 707 708 /** 709 * Returns a string representation of this set. If the result of 710 * calling this function is passed to a UnicodeSet constructor, it 711 * will produce another set that is equal to this one. 712 * A frozen set will not be modified. 713 * @param result the string to receive the rules. Previous 714 * contents will be deleted. 715 * @param escapeUnprintable if true then convert unprintable 716 * character to their hex escape representations, \\uxxxx or 717 * \\Uxxxxxxxx. Unprintable characters are those other than 718 * U+000A, U+0020..U+007E. 719 * @stable ICU 2.0 720 */ 721 virtual UnicodeString& toPattern(UnicodeString& result, 722 UBool escapeUnprintable = false) const override; 723 724 /** 725 * Modifies this set to contain those code points which have the given value 726 * for the given binary or enumerated property, as returned by 727 * u_getIntPropertyValue. Prior contents of this set are lost. 728 * A frozen set will not be modified. 
729 * 730 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 731 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 732 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 733 * 734 * @param value a value in the range u_getIntPropertyMinValue(prop).. 735 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 736 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 737 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 738 * categories such as [:L:] to be represented. 739 * 740 * @param ec error code input/output parameter 741 * 742 * @return a reference to this set 743 * 744 * @stable ICU 2.4 745 */ 746 UnicodeSet& applyIntPropertyValue(UProperty prop, 747 int32_t value, 748 UErrorCode& ec); 749 750 /** 751 * Modifies this set to contain those code points which have the 752 * given value for the given property. Prior contents of this 753 * set are lost. 754 * A frozen set will not be modified. 755 * 756 * @param prop a property alias, either short or long. The name is matched 757 * loosely. See PropertyAliases.txt for names and a description of loose 758 * matching. If the value string is empty, then this string is interpreted 759 * as either a General_Category value alias, a Script value alias, a binary 760 * property alias, or a special ID. Special IDs are matched loosely and 761 * correspond to the following sets: 762 * 763 * "ANY" = [\\u0000-\\U0010FFFF], 764 * "ASCII" = [\\u0000-\\u007F], 765 * "Assigned" = [:^Cn:]. 766 * 767 * @param value a value alias, either short or long. The name is matched 768 * loosely. See PropertyValueAliases.txt for names and a description of 769 * loose matching. In addition to aliases listed, numeric values and 770 * canonical combining classes may be expressed numerically, e.g., ("nv", 771 * "0.5") or ("ccc", "220"). The value string may also be empty. 
772 * 773 * @param ec error code input/output parameter 774 * 775 * @return a reference to this set 776 * 777 * @stable ICU 2.4 778 */ 779 UnicodeSet& applyPropertyAlias(const UnicodeString& prop, 780 const UnicodeString& value, 781 UErrorCode& ec); 782 783 /** 784 * Returns the number of elements in this set (its cardinality). 785 * Note that the elements of a set may include both individual 786 * codepoints and strings. 787 * 788 * This is slower than getRangeCount() because 789 * it counts the code points of all ranges. 790 * 791 * @return the number of elements in this set (its cardinality). 792 * @stable ICU 2.0 793 * @see getRangeCount 794 */ 795 int32_t size() const; 796 797 /** 798 * Returns <tt>true</tt> if this set contains no elements. 799 * 800 * @return <tt>true</tt> if this set contains no elements. 801 * @stable ICU 2.0 802 */ 803 UBool isEmpty() const; 804 805 /** 806 * @return true if this set contains multi-character strings or the empty string. 807 * @stable ICU 70 808 */ 809 UBool hasStrings() const; 810 811 /** 812 * Returns true if this set contains the given character. 813 * This function works faster with a frozen set. 814 * @param c character to be checked for containment 815 * @return true if the test condition is met 816 * @stable ICU 2.0 817 */ 818 virtual UBool contains(UChar32 c) const override; 819 820 /** 821 * Returns true if this set contains every character 822 * of the given range. 823 * @param start first character, inclusive, of the range 824 * @param end last character, inclusive, of the range 825 * @return true if the test condition is met 826 * @stable ICU 2.0 827 */ 828 UBool contains(UChar32 start, UChar32 end) const; 829 830 /** 831 * Returns <tt>true</tt> if this set contains the given 832 * multicharacter string. 
833 * @param s string to be checked for containment 834 * @return <tt>true</tt> if this set contains the specified string 835 * @stable ICU 2.4 836 */ 837 UBool contains(const UnicodeString& s) const; 838 839 /** 840 * Returns true if this set contains all the characters and strings 841 * of the given set. 842 * @param c set to be checked for containment 843 * @return true if the test condition is met 844 * @stable ICU 2.4 845 */ 846 UBool containsAll(const UnicodeSet& c) const; 847 848 /** 849 * Returns true if this set contains all the characters 850 * of the given string. 851 * @param s string containing characters to be checked for containment 852 * @return true if the test condition is met 853 * @stable ICU 2.4 854 */ 855 UBool containsAll(const UnicodeString& s) const; 856 857 /** 858 * Returns true if this set contains none of the characters 859 * of the given range. 860 * @param start first character, inclusive, of the range 861 * @param end last character, inclusive, of the range 862 * @return true if the test condition is met 863 * @stable ICU 2.4 864 */ 865 UBool containsNone(UChar32 start, UChar32 end) const; 866 867 /** 868 * Returns true if this set contains none of the characters and strings 869 * of the given set. 870 * @param c set to be checked for containment 871 * @return true if the test condition is met 872 * @stable ICU 2.4 873 */ 874 UBool containsNone(const UnicodeSet& c) const; 875 876 /** 877 * Returns true if this set contains none of the characters 878 * of the given string. 879 * @param s string containing characters to be checked for containment 880 * @return true if the test condition is met 881 * @stable ICU 2.4 882 */ 883 UBool containsNone(const UnicodeString& s) const; 884 885 /** 886 * Returns true if this set contains one or more of the characters 887 * in the given range. 
888 * @param start first character, inclusive, of the range 889 * @param end last character, inclusive, of the range 890 * @return true if the condition is met 891 * @stable ICU 2.4 892 */ 893 inline UBool containsSome(UChar32 start, UChar32 end) const; 894 895 /** 896 * Returns true if this set contains one or more of the characters 897 * and strings of the given set. 898 * @param s The set to be checked for containment 899 * @return true if the condition is met 900 * @stable ICU 2.4 901 */ 902 inline UBool containsSome(const UnicodeSet& s) const; 903 904 /** 905 * Returns true if this set contains one or more of the characters 906 * of the given string. 907 * @param s string containing characters to be checked for containment 908 * @return true if the condition is met 909 * @stable ICU 2.4 910 */ 911 inline UBool containsSome(const UnicodeString& s) const; 912 913 /** 914 * Returns the length of the initial substring of the input string which 915 * consists only of characters and strings that are contained in this set 916 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 917 * or only of characters and strings that are not contained 918 * in this set (USET_SPAN_NOT_CONTAINED). 919 * See USetSpanCondition for details. 920 * Similar to the strspn() C library function. 921 * Unpaired surrogates are treated according to contains() of their surrogate code points. 922 * This function works faster with a frozen set and with a non-negative string length argument. 
923 * @param s start of the string 924 * @param length of the string; can be -1 for NUL-terminated 925 * @param spanCondition specifies the containment condition 926 * @return the length of the initial substring according to the spanCondition; 927 * 0 if the start of the string does not fit the spanCondition 928 * @stable ICU 3.8 929 * @see USetSpanCondition 930 */ 931 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; 932 933 /** 934 * Returns the end of the substring of the input string according to the USetSpanCondition. 935 * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code> 936 * after pinning start to 0<=start<=s.length(). 937 * @param s the string 938 * @param start the start index in the string for the span operation 939 * @param spanCondition specifies the containment condition 940 * @return the exclusive end of the substring according to the spanCondition; 941 * the substring s.tempSubStringBetween(start, end) fulfills the spanCondition 942 * @stable ICU 4.4 943 * @see USetSpanCondition 944 */ 945 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; 946 947 /** 948 * Returns the start of the trailing substring of the input string which 949 * consists only of characters and strings that are contained in this set 950 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 951 * or only of characters and strings that are not contained 952 * in this set (USET_SPAN_NOT_CONTAINED). 953 * See USetSpanCondition for details. 954 * Unpaired surrogates are treated according to contains() of their surrogate code points. 955 * This function works faster with a frozen set and with a non-negative string length argument. 
956 * @param s start of the string 957 * @param length of the string; can be -1 for NUL-terminated 958 * @param spanCondition specifies the containment condition 959 * @return the start of the trailing substring according to the spanCondition; 960 * the string length if the end of the string does not fit the spanCondition 961 * @stable ICU 3.8 962 * @see USetSpanCondition 963 */ 964 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; 965 966 /** 967 * Returns the start of the substring of the input string according to the USetSpanCondition. 968 * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code> 969 * after pinning limit to 0<=limit<=s.length(). 970 * @param s the string 971 * @param limit the exclusive-end index in the string for the span operation 972 * (use s.length() or INT32_MAX for spanning back from the end of the string) 973 * @param spanCondition specifies the containment condition 974 * @return the start of the substring according to the spanCondition; 975 * the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition 976 * @stable ICU 4.4 977 * @see USetSpanCondition 978 */ 979 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; 980 981 /** 982 * Returns the length of the initial substring of the input string which 983 * consists only of characters and strings that are contained in this set 984 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 985 * or only of characters and strings that are not contained 986 * in this set (USET_SPAN_NOT_CONTAINED). 987 * See USetSpanCondition for details. 988 * Similar to the strspn() C library function. 989 * Malformed byte sequences are treated according to contains(0xfffd). 990 * This function works faster with a frozen set and with a non-negative string length argument.
991 * @param s start of the string (UTF-8) 992 * @param length of the string; can be -1 for NUL-terminated 993 * @param spanCondition specifies the containment condition 994 * @return the length of the initial substring according to the spanCondition; 995 * 0 if the start of the string does not fit the spanCondition 996 * @stable ICU 3.8 997 * @see USetSpanCondition 998 */ 999 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 1000 1001 /** 1002 * Returns the start of the trailing substring of the input string which 1003 * consists only of characters and strings that are contained in this set 1004 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1005 * or only of characters and strings that are not contained 1006 * in this set (USET_SPAN_NOT_CONTAINED). 1007 * See USetSpanCondition for details. 1008 * Malformed byte sequences are treated according to contains(0xfffd). 1009 * This function works faster with a frozen set and with a non-negative string length argument. 1010 * @param s start of the string (UTF-8) 1011 * @param length of the string; can be -1 for NUL-terminated 1012 * @param spanCondition specifies the containment condition 1013 * @return the start of the trailing substring according to the spanCondition; 1014 * the string length if the end of the string does not fit the spanCondition 1015 * @stable ICU 3.8 1016 * @see USetSpanCondition 1017 */ 1018 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 1019 1020 /** 1021 * Implement UnicodeMatcher::matches() 1022 * @stable ICU 2.4 1023 */ 1024 UMatchDegree matches(const Replaceable& text, 1025 int32_t& offset, 1026 int32_t limit, 1027 UBool incremental) override; 1028 1029 private: 1030 /** 1031 * Returns the longest match for s in text at the given position. 1032 * If limit > start then match forward from start+1 to limit 1033 * matching all characters except s.charAt(0). 
If limit < start, 1034 * go backward starting from start-1 matching all characters 1035 * except s.charAt(s.length()-1). This method assumes that the 1036 * first character, text.charAt(start), matches s, so it does not 1037 * check it. 1038 * @param text the text to match 1039 * @param start the first character to match. In the forward 1040 * direction, text.charAt(start) is matched against s.charAt(0). 1041 * In the reverse direction, it is matched against 1042 * s.charAt(s.length()-1). 1043 * @param limit the limit offset for matching, either last+1 in 1044 * the forward direction, or last-1 in the reverse direction, 1045 * where last is the index of the last character to match. 1046 * @param s 1047 * @return If part of s matches up to the limit, return |limit - 1048 * start|. If all of s matches before reaching the limit, return 1049 * s.length(). If there is a mismatch between s and text, return 1050 * 0 1051 */ 1052 static int32_t matchRest(const Replaceable& text, 1053 int32_t start, int32_t limit, 1054 const UnicodeString& s); 1055 1056 /** 1057 * Returns the smallest value i such that c < list[i]. Caller 1058 * must ensure that c is a legal value or this method will enter 1059 * an infinite loop. This method performs a binary search. 1060 * @param c a character in the range MIN_VALUE..MAX_VALUE 1061 * inclusive 1062 * @return the smallest integer i in the range 0..len-1, 1063 * inclusive, such that c < list[i] 1064 */ 1065 int32_t findCodePoint(UChar32 c) const; 1066 1067 public: 1068 1069 /** 1070 * Implementation of UnicodeMatcher API. Union the set of all 1071 * characters that may be matched by this object into the given 1072 * set. 1073 * @param toUnionTo the set into which to union the source characters 1074 * @stable ICU 2.4 1075 */ 1076 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override; 1077 1078 /** 1079 * Returns the index of the given character within this set, where 1080 * the set is ordered by ascending code point. 
If the character 1081 * is not in this set, return -1. The inverse of this method is 1082 * <code>charAt()</code>. 1083 * @return an index from 0..size()-1, or -1 1084 * @stable ICU 2.4 1085 */ 1086 int32_t indexOf(UChar32 c) const; 1087 1088 /** 1089 * Returns the character at the given index within this set, where 1090 * the set is ordered by ascending code point. If the index is 1091 * out of range for characters, returns (UChar32)-1. 1092 * The inverse of this method is <code>indexOf()</code>. 1093 * 1094 * For iteration, this is slower than UnicodeSetIterator or 1095 * getRangeCount()/getRangeStart()/getRangeEnd(), 1096 * because for each call it skips linearly over <code>index</code> 1097 * characters in the ranges. 1098 * 1099 * @param index an index from 0..size()-1 1100 * @return the character at the given index, or (UChar32)-1. 1101 * @stable ICU 2.4 1102 */ 1103 UChar32 charAt(int32_t index) const; 1104 1105 /** 1106 * Returns a C++ "range" for iterating over the code points of this set. 1107 * 1108 * \code 1109 * UnicodeSet set(u"[abcçカ🚴]", errorCode); 1110 * for (UChar32 c : set.codePoints()) { 1111 * printf("set.codePoint U+%04lx\n", (long)c); 1112 * } 1113 * \endcode 1114 * 1115 * @return a "range" object for iterating over the code points of this set. 1116 * @stable ICU 76 1117 * @see ranges 1118 * @see strings 1119 * @see begin 1120 * @see end 1121 */ 1122 inline U_HEADER_NESTED_NAMESPACE::USetCodePoints codePoints() const { 1123 return U_HEADER_NESTED_NAMESPACE::USetCodePoints(toUSet()); 1124 } 1125 1126 /** 1127 * Returns a C++ "range" for iterating over the code point ranges of this set. 
1128 * 1129 * \code 1130 * UnicodeSet set(u"[abcçカ🚴]", errorCode); 1131 * for (auto [start, end] : set.ranges()) { 1132 * printf("set.range U+%04lx..U+%04lx\n", (long)start, (long)end); 1133 * } 1134 * for (auto range : set.ranges()) { 1135 * for (UChar32 c : range) { 1136 * printf("set.range.c U+%04lx\n", (long)c); 1137 * } 1138 * } 1139 * \endcode 1140 * 1141 * @return a "range" object for iterating over the code point ranges of this set. 1142 * @stable ICU 76 1143 * @see codePoints 1144 * @see strings 1145 * @see begin 1146 * @see end 1147 */ 1148 inline U_HEADER_NESTED_NAMESPACE::USetRanges ranges() const { 1149 return U_HEADER_NESTED_NAMESPACE::USetRanges(toUSet()); 1150 } 1151 1152 /** 1153 * Returns a C++ "range" for iterating over the empty and multi-character strings of this set. 1154 * Returns each string as a std::u16string_view without copying its contents. 1155 * 1156 * \code 1157 * UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode); 1158 * for (auto s : set.strings()) { 1159 * UnicodeString us(s); 1160 * std::string u8; 1161 * printf("set.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str()); 1162 * } 1163 * \endcode 1164 * 1165 * @return a "range" object for iterating over the strings of this set. 1166 * @stable ICU 76 1167 * @see codePoints 1168 * @see ranges 1169 * @see begin 1170 * @see end 1171 */ 1172 inline U_HEADER_NESTED_NAMESPACE::USetStrings strings() const { 1173 return U_HEADER_NESTED_NAMESPACE::USetStrings(toUSet()); 1174 } 1175 1176 #ifndef U_HIDE_DRAFT_API 1177 /** 1178 * Returns a C++ iterator for iterating over all of the elements of this set. 1179 * Convenient all-in one iteration, but creates a std::u16string for each 1180 * code point or string. 1181 * (Similar to how Java UnicodeSet *is an* Iterable<String>.) 1182 * 1183 * Code points are returned first, then empty and multi-character strings. 
1184 * 1185 * \code 1186 * UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode); 1187 * for (auto el : set) { 1188 * UnicodeString us(el); 1189 * std::string u8; 1190 * printf("set.element length %ld \"%s\"\n", (long)us.length(), us.toUTF8String(u8).c_str()); 1191 * } 1192 * \endcode 1193 * 1194 * @return an all-elements iterator. 1195 * @draft ICU 77 1196 * @see end 1197 * @see codePoints 1198 * @see ranges 1199 * @see strings 1200 */ 1201 inline U_HEADER_NESTED_NAMESPACE::USetElementIterator begin() const { 1202 return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).begin(); 1203 } 1204 1205 /** 1206 * @return an exclusive-end sentinel for iterating over all of the elements of this set. 1207 * @draft ICU 77 1208 * @see begin 1209 * @see codePoints 1210 * @see ranges 1211 * @see strings 1212 */ 1213 inline U_HEADER_NESTED_NAMESPACE::USetElementIterator end() const { 1214 return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).end(); 1215 } 1216 #endif // U_HIDE_DRAFT_API 1217 1218 /** 1219 * Adds the specified range to this set if it is not already 1220 * present. If this set already contains the specified range, 1221 * the call leaves this set unchanged. If <code>start > end</code> 1222 * then an empty range is added, leaving the set unchanged. 1223 * This is equivalent to a boolean logic OR, or a set UNION. 1224 * A frozen set will not be modified. 1225 * 1226 * @param start first character, inclusive, of range to be added 1227 * to this set. 1228 * @param end last character, inclusive, of range to be added 1229 * to this set. 1230 * @stable ICU 2.0 1231 */ 1232 UnicodeSet& add(UChar32 start, UChar32 end); 1233 1234 /** 1235 * Adds the specified character to this set if it is not already 1236 * present. If this set already contains the specified character, 1237 * the call leaves this set unchanged. 1238 * A frozen set will not be modified. 
1239 * 1240 * @param c the character (code point) 1241 * @return this object, for chaining 1242 * @stable ICU 2.0 1243 */ 1244 UnicodeSet& add(UChar32 c); 1245 1246 /** 1247 * Adds the specified multicharacter string to this set if it is not already 1248 * present. If this set already contains the multicharacter string, 1249 * the call leaves this set unchanged. 1250 * Thus "ch" => {"ch"} 1251 * A frozen set will not be modified. 1252 * 1253 * @param s the source string 1254 * @return this object, for chaining 1255 * @stable ICU 2.4 1256 */ 1257 UnicodeSet& add(const UnicodeString& s); 1258 1259 private: 1260 /** 1261 * @return a code point IF the string consists of a single one. 1262 * otherwise returns -1. 1263 * @param s string to test 1264 */ 1265 static int32_t getSingleCP(const UnicodeString& s); 1266 1267 void _add(const UnicodeString& s); 1268 1269 public: 1270 /** 1271 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} 1272 * If this set already contains any particular character, it has no effect on that character. 1273 * A frozen set will not be modified. 1274 * @param s the source string 1275 * @return this object, for chaining 1276 * @stable ICU 2.4 1277 */ 1278 UnicodeSet& addAll(const UnicodeString& s); 1279 1280 /** 1281 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1282 * A frozen set will not be modified. 1283 * @param s the source string 1284 * @return this object, for chaining 1285 * @stable ICU 2.4 1286 */ 1287 UnicodeSet& retainAll(const UnicodeString& s); 1288 1289 /** 1290 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1291 * A frozen set will not be modified. 1292 * @param s the source string 1293 * @return this object, for chaining 1294 * @stable ICU 2.4 1295 */ 1296 UnicodeSet& complementAll(const UnicodeString& s); 1297 1298 /** 1299 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1300 * A frozen set will not be modified.
1301 * @param s the source string 1302 * @return this object, for chaining 1303 * @stable ICU 2.4 1304 */ 1305 UnicodeSet& removeAll(const UnicodeString& s); 1306 1307 /** 1308 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1309 * 1310 * @param s the source string 1311 * @return a newly created set containing the given string. 1312 * The caller owns the return object and is responsible for deleting it. 1313 * @stable ICU 2.4 1314 */ 1315 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s); 1316 1317 1318 /** 1319 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1320 * @param s the source string 1321 * @return a newly created set containing the given characters 1322 * The caller owns the return object and is responsible for deleting it. 1323 * @stable ICU 2.4 1324 */ 1325 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s); 1326 1327 /** 1328 * Retain only the elements in this set that are contained in the 1329 * specified range. If <code>start > end</code> then an empty range is 1330 * retained, leaving the set empty. This is equivalent to 1331 * a boolean logic AND, or a set INTERSECTION. 1332 * A frozen set will not be modified. 1333 * 1334 * @param start first character, inclusive, of range 1335 * @param end last character, inclusive, of range 1336 * @stable ICU 2.0 1337 */ 1338 UnicodeSet& retain(UChar32 start, UChar32 end); 1339 1340 1341 /** 1342 * Retain the specified character from this set if it is present. 1343 * A frozen set will not be modified. 1344 * 1345 * @param c the character (code point) 1346 * @return this object, for chaining 1347 * @stable ICU 2.0 1348 */ 1349 UnicodeSet& retain(UChar32 c); 1350 1351 /** 1352 * Retains only the specified string from this set if it is present. 1353 * Upon return this set will be empty if it did not contain s, or 1354 * will only contain s if it did contain s. 1355 * A frozen set will not be modified. 
1356 * 1357 * @param s the source string 1358 * @return this object, for chaining 1359 * @stable ICU 69 1360 */ 1361 UnicodeSet& retain(const UnicodeString &s); 1362 1363 /** 1364 * Removes the specified range from this set if it is present. 1365 * The set will not contain the specified range once the call 1366 * returns. If <code>start > end</code> then an empty range is 1367 * removed, leaving the set unchanged. 1368 * A frozen set will not be modified. 1369 * 1370 * @param start first character, inclusive, of range to be removed 1371 * from this set. 1372 * @param end last character, inclusive, of range to be removed 1373 * from this set. 1374 * @stable ICU 2.0 1375 */ 1376 UnicodeSet& remove(UChar32 start, UChar32 end); 1377 1378 /** 1379 * Removes the specified character from this set if it is present. 1380 * The set will not contain the specified character once the call 1381 * returns. 1382 * A frozen set will not be modified. 1383 * 1384 * @param c the character (code point) 1385 * @return this object, for chaining 1386 * @stable ICU 2.0 1387 */ 1388 UnicodeSet& remove(UChar32 c); 1389 1390 /** 1391 * Removes the specified string from this set if it is present. 1392 * The set will not contain the specified string once the call 1393 * returns. 1394 * A frozen set will not be modified. 1395 * @param s the source string 1396 * @return this object, for chaining 1397 * @stable ICU 2.4 1398 */ 1399 UnicodeSet& remove(const UnicodeString& s); 1400 1401 /** 1402 * This is equivalent to 1403 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1404 * 1405 * <strong>Note:</strong> This performs a symmetric difference with all code points 1406 * <em>and thus retains all multicharacter strings</em>. 1407 * In order to achieve a “code point complement” (all code points minus this set), 1408 * the easiest is to <code>.complement().removeAllStrings()</code>. 1409 * 1410 * A frozen set will not be modified.
1411 * @stable ICU 2.0 1412 */ 1413 UnicodeSet& complement(); 1414 1415 /** 1416 * Complements the specified range in this set. Any character in 1417 * the range will be removed if it is in this set, or will be 1418 * added if it is not in this set. If <code>start > end</code> 1419 * then an empty range is complemented, leaving the set unchanged. 1420 * This is equivalent to a boolean logic XOR. 1421 * A frozen set will not be modified. 1422 * 1423 * @param start first character, inclusive, of range 1424 * @param end last character, inclusive, of range 1425 * @stable ICU 2.0 1426 */ 1427 UnicodeSet& complement(UChar32 start, UChar32 end); 1428 1429 /** 1430 * Complements the specified character in this set. The character 1431 * will be removed if it is in this set, or will be added if it is 1432 * not in this set. 1433 * A frozen set will not be modified. 1434 * 1435 * @param c the character (code point) 1436 * @return this object, for chaining 1437 * @stable ICU 2.0 1438 */ 1439 UnicodeSet& complement(UChar32 c); 1440 1441 /** 1442 * Complement the specified string in this set. 1443 * The string will be removed if it is in this set, or will be added if it is not in this set. 1444 * A frozen set will not be modified. 1445 * 1446 * @param s the string to complement 1447 * @return this object, for chaining 1448 * @stable ICU 2.4 1449 */ 1450 UnicodeSet& complement(const UnicodeString& s); 1451 1452 /** 1453 * Adds all of the elements in the specified set to this set if 1454 * they're not already present. This operation effectively 1455 * modifies this set so that its value is the <i>union</i> of the two 1456 * sets. The behavior of this operation is unspecified if the specified 1457 * collection is modified while the operation is in progress. 1458 * A frozen set will not be modified. 1459 * 1460 * @param c set whose elements are to be added to this set. 
1461 * @see #add(UChar32, UChar32) 1462 * @stable ICU 2.0 1463 */ 1464 UnicodeSet& addAll(const UnicodeSet& c); 1465 1466 /** 1467 * Retains only the elements in this set that are contained in the 1468 * specified set. In other words, removes from this set all of 1469 * its elements that are not contained in the specified set. This 1470 * operation effectively modifies this set so that its value is 1471 * the <i>intersection</i> of the two sets. 1472 * A frozen set will not be modified. 1473 * 1474 * @param c set that defines which elements this set will retain. 1475 * @stable ICU 2.0 1476 */ 1477 UnicodeSet& retainAll(const UnicodeSet& c); 1478 1479 /** 1480 * Removes from this set all of its elements that are contained in the 1481 * specified set. This operation effectively modifies this 1482 * set so that its value is the <i>asymmetric set difference</i> of 1483 * the two sets. 1484 * A frozen set will not be modified. 1485 * 1486 * @param c set that defines which elements will be removed from 1487 * this set. 1488 * @stable ICU 2.0 1489 */ 1490 UnicodeSet& removeAll(const UnicodeSet& c); 1491 1492 /** 1493 * Complements in this set all elements contained in the specified 1494 * set. Any character in the other set will be removed if it is 1495 * in this set, or will be added if it is not in this set. 1496 * A frozen set will not be modified. 1497 * 1498 * @param c set that defines which elements will be xor'ed from 1499 * this set. 1500 * @stable ICU 2.4 1501 */ 1502 UnicodeSet& complementAll(const UnicodeSet& c); 1503 1504 /** 1505 * Removes all of the elements from this set. This set will be 1506 * empty after this call returns. 1507 * A frozen set will not be modified. 1508 * @stable ICU 2.0 1509 */ 1510 UnicodeSet& clear(); 1511 1512 /** 1513 * Close this set over the given attribute. For the attribute 1514 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 1515 * 1516 * 1. 
For each character or string 'a' in this set, all strings or 1517 * characters 'b' such that foldCase(a) == foldCase(b) are added 1518 * to this set. 1519 * 1520 * 2. For each string 'e' in the resulting set, if e != 1521 * foldCase(e), 'e' will be removed. 1522 * 1523 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 1524 * 1525 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 1526 * == b denotes that the contents are the same, not pointer 1527 * comparison.) 1528 * 1529 * A frozen set will not be modified. 1530 * 1531 * @param attribute bitmask for attributes to close over. 1532 * Valid options: 1533 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 1534 * These case options are mutually exclusive. 1535 * Unrelated options bits are ignored. 1536 * @return a reference to this set. 1537 * @stable ICU 4.2 1538 */ 1539 UnicodeSet& closeOver(int32_t attribute); 1540 1541 /** 1542 * Remove all strings from this set. 1543 * 1544 * @return a reference to this set. 1545 * @stable ICU 4.2 1546 */ 1547 UnicodeSet &removeAllStrings(); 1548 1549 /** 1550 * Iteration method that returns the number of ranges contained in 1551 * this set. 1552 * @see #getRangeStart 1553 * @see #getRangeEnd 1554 * @stable ICU 2.4 1555 */ 1556 int32_t getRangeCount() const; 1557 1558 /** 1559 * Iteration method that returns the first character in the 1560 * specified range of this set. 1561 * @see #getRangeCount 1562 * @see #getRangeEnd 1563 * @stable ICU 2.4 1564 */ 1565 UChar32 getRangeStart(int32_t index) const; 1566 1567 /** 1568 * Iteration method that returns the last character in the 1569 * specified range of this set. 1570 * @see #getRangeCount 1571 * @see #getRangeStart 1572 * @stable ICU 2.4 1573 */ 1574 UChar32 getRangeEnd(int32_t index) const; 1575 1576 /** 1577 * Serializes this set into an array of 16-bit integers.
Serialization 1578 * (currently) only records the characters in the set; multicharacter 1579 * strings are ignored. 1580 * 1581 * The array has the following format (each line is one 16-bit 1582 * integer): 1583 * 1584 * length = (n+2*m) | (m!=0?0x8000:0) 1585 * bmpLength = n; present if m!=0 1586 * bmp[0] 1587 * bmp[1] 1588 * ... 1589 * bmp[n-1] 1590 * supp-high[0] 1591 * supp-low[0] 1592 * supp-high[1] 1593 * supp-low[1] 1594 * ... 1595 * supp-high[m-1] 1596 * supp-low[m-1] 1597 * 1598 * The array starts with a header. After the header are n bmp 1599 * code points, then m supplementary code points. Either n or m 1600 * or both may be zero. n+2*m is always <= 0x7FFF. 1601 * 1602 * If there are no supplementary characters (if m==0) then the 1603 * header is one 16-bit integer, 'length', with value n. 1604 * 1605 * If there are supplementary characters (if m!=0) then the header 1606 * is two 16-bit integers. The first, 'length', has value 1607 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1608 * 1609 * After the header the code points are stored in ascending order. 1610 * Supplementary code points are stored as most significant 16 1611 * bits followed by least significant 16 bits. 1612 * 1613 * @param dest pointer to buffer of destCapacity 16-bit integers. 1614 * May be nullptr only if destCapacity is zero. 1615 * @param destCapacity size of dest, or zero. Must not be negative. 1616 * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR 1617 * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if 1618 * n+2*m+(m!=0?2:1) > destCapacity. 1619 * @return the total length of the serialized format, including 1620 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1621 * than U_BUFFER_OVERFLOW_ERROR.
1622 * @stable ICU 2.4 1623 */ 1624 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const; 1625 1626 /** 1627 * Reallocate this object's internal structures to take up the least 1628 * possible space, without changing this object's value. 1629 * A frozen set will not be modified. 1630 * @stable ICU 2.4 1631 */ 1632 UnicodeSet& compact(); 1633 1634 /** 1635 * Return the class ID for this class. This is useful only for 1636 * comparing to a return value from getDynamicClassID(). For example: 1637 * <pre> 1638 * . Base* polymorphic_pointer = createPolymorphicObject(); 1639 * . if (polymorphic_pointer->getDynamicClassID() == 1640 * . Derived::getStaticClassID()) ... 1641 * </pre> 1642 * @return The class ID for all objects of this class. 1643 * @stable ICU 2.0 1644 */ 1645 static UClassID U_EXPORT2 getStaticClassID(); 1646 1647 /** 1648 * Implement UnicodeFunctor API. 1649 * 1650 * @return The class ID for this object. All objects of a given 1651 * class have the same class ID. Objects of other classes have 1652 * different class IDs. 1653 * @stable ICU 2.4 1654 */ 1655 virtual UClassID getDynamicClassID() const override; 1656 1657 private: 1658 1659 // Private API for the USet API 1660 1661 friend class USetAccess; 1662 1663 const UnicodeString* getString(int32_t index) const; 1664 1665 //---------------------------------------------------------------- 1666 // RuleBasedTransliterator support 1667 //---------------------------------------------------------------- 1668 1669 private: 1670 1671 /** 1672 * Returns <tt>true</tt> if this set contains any character whose low byte 1673 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 1674 * indexing.
1675 */ 1676 virtual UBool matchesIndexValue(uint8_t v) const override; 1677 1678 private: 1679 friend class RBBIRuleScanner; 1680 1681 //---------------------------------------------------------------- 1682 // Implementation: Clone as thawed (see ICU4J Freezable) 1683 //---------------------------------------------------------------- 1684 1685 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); 1686 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed); 1687 1688 //---------------------------------------------------------------- 1689 // Implementation: Pattern parsing 1690 //---------------------------------------------------------------- 1691 1692 void applyPatternIgnoreSpace(const UnicodeString& pattern, 1693 ParsePosition& pos, 1694 const SymbolTable* symbols, 1695 UErrorCode& status); 1696 1697 void applyPattern(RuleCharacterIterator& chars, 1698 const SymbolTable* symbols, 1699 UnicodeString& rebuiltPat, 1700 uint32_t options, 1701 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 1702 int32_t depth, 1703 UErrorCode& ec); 1704 1705 void closeOverCaseInsensitive(bool simple); 1706 void closeOverAddCaseMappings(); 1707 1708 //---------------------------------------------------------------- 1709 // Implementation: Utility methods 1710 //---------------------------------------------------------------- 1711 1712 static int32_t nextCapacity(int32_t minCapacity); 1713 1714 bool ensureCapacity(int32_t newLen); 1715 1716 bool ensureBufferCapacity(int32_t newLen); 1717 1718 void swapBuffers(); 1719 1720 UBool allocateStrings(UErrorCode &status); 1721 int32_t stringsSize() const; 1722 UBool stringsContains(const UnicodeString &s) const; 1723 1724 UnicodeString& _toPattern(UnicodeString& result, 1725 UBool escapeUnprintable) const; 1726 1727 UnicodeString& _generatePattern(UnicodeString& result, 1728 UBool escapeUnprintable) const; 1729 1730 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable); 1731 1732 static 
void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable); 1733 1734 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end, 1735 UBool escapeUnprintable); 1736 1737 //---------------------------------------------------------------- 1738 // Implementation: Fundamental operators 1739 //---------------------------------------------------------------- 1740 /** Low-level set operators. Each takes a raw UChar32 array, its length, and a polarity value. NOTE(review): presumably `other` is an inversion list in the representation this set uses internally, and polarity selects how complements are combined -- confirm in the implementation. */ 1741 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity); 1742 1743 void add(const UChar32* other, int32_t otherLen, int8_t polarity); 1744 1745 void retain(const UChar32* other, int32_t otherLen, int8_t polarity); 1746 1747 /** 1748 * Return true if the given position, in the given pattern, appears 1749 * to be the start of a property set pattern: [:foo:], \\p{foo}, 1750 * \\P{foo}, or \\N{name}. * @param pattern the pattern string to examine * @param pos the position within pattern to test 1751 */ 1752 static UBool resemblesPropertyPattern(const UnicodeString& pattern, 1753 int32_t pos); 1754 1755 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars, 1756 int32_t iterOpts); 1757 1758 /** 1759 * Parse the given property pattern at the given parse position 1760 * and set this UnicodeSet to the result. 1761 * 1762 * The original design document is out of date, but still useful. 1763 * Ignore the property and value names: 1764 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html 1765 * 1766 * Recognized syntax: 1767 * 1768 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]" 1769 * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P" 1770 * \\N{name} - white space not allowed within "\\N" 1771 * 1772 * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored. 1773 * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading 1774 * and trailing space is deleted, and internal runs of whitespace 1775 * are collapsed to a single space. 
1776 * 1777 * We support binary properties, enumerated properties, and the 1778 * following non-enumerated properties: 1779 * 1780 * Numeric_Value 1781 * Name 1782 * Unicode_1_Name 1783 * 1784 * @param pattern the pattern string 1785 * @param ppos on entry, the position at which to begin parsing. 1786 * This should be one of the locations marked '^': 1787 * 1788 * [:blah:] \\p{blah} \\P{blah} \\N{name} 1789 * ^ % ^ % ^ % ^ % 1790 * 1791 * On return, the position after the last character parsed, that is, 1792 * the locations marked '%'. If the parse fails, ppos is returned 1793 * unchanged. 1794 * @param ec status 1795 * @return a reference to this. 1796 */ 1797 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern, 1798 ParsePosition& ppos, 1799 UErrorCode &ec); 1800 /** Overload of applyPropertyPattern that reads from a RuleCharacterIterator and also takes a rebuiltPat string. NOTE(review): presumably the text of the parsed property pattern is appended to rebuiltPat -- confirm in the implementation. */ 1801 void applyPropertyPattern(RuleCharacterIterator& chars, 1802 UnicodeString& rebuiltPat, 1803 UErrorCode& ec); 1804 1805 /** 1806 * A filter that returns true if the given code point should be 1807 * included in the UnicodeSet being constructed. 1808 */ 1809 typedef UBool (*Filter)(UChar32 codePoint, void* context); 1810 1811 /** 1812 * Given a filter, set this UnicodeSet to the code points 1813 * contained by that filter. The filter MUST be 1814 * property-conformant. That is, if it returns value v for one 1815 * code point, then it must return v for all affiliated code 1816 * points, as defined by the inclusions list. See 1817 * getInclusions(). 1818 * NOTE(review): this comment previously ended with "src is a UPropertySource value", but the declaration below has no src parameter; that remark appears to be stale. * @param filter the predicate deciding which code points are included (see the Filter typedef) * @param context opaque pointer; NOTE(review): presumably passed through to filter as its context argument -- confirm * @param inclusions the inclusions list that defines the affiliated code points * @param status error code 1819 */ 1820 void applyFilter(Filter filter, 1821 void* context, 1822 const UnicodeSet* inclusions, 1823 UErrorCode &status); 1824 1825 /** 1826 * Set the new pattern to cache. * The UnicodeString overload simply forwards newPat's buffer and length * to the (pointer, length) overload declared right after it. 1827 */ 1828 void setPattern(const UnicodeString& newPat) { 1829 setPattern(newPat.getBuffer(), newPat.length()); 1830 } 1831 void setPattern(const char16_t *newPat, int32_t newPatLen); 1832 /** 1833 * Release existing cached pattern. 
1834 */ 1835 void releasePattern(); 1836 1837 friend class UnicodeSetIterator; 1838 }; 1839 1840 1841 1842 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const { 1843 return !operator==(o); 1844 } 1845 1846 inline UBool UnicodeSet::isFrozen() const { 1847 return bmpSet != nullptr || stringSpan != nullptr; 1848 } 1849 1850 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { 1851 return !containsNone(start, end); 1852 } 1853 1854 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const { 1855 return !containsNone(s); 1856 } 1857 1858 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { 1859 return !containsNone(s); 1860 } 1861 1862 inline UBool UnicodeSet::isBogus() const { 1863 return fFlags & kIsBogus; 1864 } 1865 1866 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { 1867 return reinterpret_cast<UnicodeSet *>(uset); 1868 } 1869 1870 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { 1871 return reinterpret_cast<const UnicodeSet *>(uset); 1872 } 1873 1874 inline USet *UnicodeSet::toUSet() { 1875 return reinterpret_cast<USet *>(this); 1876 } 1877 1878 inline const USet *UnicodeSet::toUSet() const { 1879 return reinterpret_cast<const USet *>(this); 1880 } 1881 1882 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { 1883 int32_t sLength=s.length(); 1884 if(start<0) { 1885 start=0; 1886 } else if(start>sLength) { 1887 start=sLength; 1888 } 1889 return start+span(s.getBuffer()+start, sLength-start, spanCondition); 1890 } 1891 1892 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { 1893 int32_t sLength=s.length(); 1894 if(limit<0) { 1895 limit=0; 1896 } else if(limit>sLength) { 1897 limit=sLength; 1898 } 1899 return spanBack(s.getBuffer(), limit, spanCondition); 1900 } 1901 1902 U_NAMESPACE_END 1903 1904 #endif /* U_SHOW_CPLUSPLUS_API */ 1905 1906 #endif