uset.h (64545B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uset.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar07 16 * created by: Markus W. Scherer 17 * 18 * C version of UnicodeSet. 19 */ 20 21 22 /** 23 * \file 24 * \brief C API: Unicode Set 25 * 26 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 27 */ 28 29 #ifndef __USET_H__ 30 #define __USET_H__ 31 32 #include "unicode/utypes.h" 33 #include "unicode/uchar.h" 34 35 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API 36 #include <string> 37 #include <string_view> 38 #include "unicode/char16ptr.h" 39 #include "unicode/localpointer.h" 40 #include "unicode/utf16.h" 41 #endif 42 43 #ifndef USET_DEFINED 44 45 #ifndef U_IN_DOXYGEN 46 #define USET_DEFINED 47 #endif 48 /** 49 * USet is the C API type corresponding to C++ class UnicodeSet. 50 * Use the uset_* API to manipulate. Create with 51 * uset_open*, and destroy with uset_close. 52 * @stable ICU 2.4 53 */ 54 typedef struct USet USet; 55 #endif 56 57 /** 58 * Bitmask values to be passed to uset_openPatternOptions() or 59 * uset_applyPattern() taking an option parameter. 60 * 61 * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 62 * These case options are mutually exclusive. 63 * 64 * Undefined options bits are ignored, and reserved for future use. 65 * 66 * @stable ICU 2.4 67 */ 68 enum { 69 /** 70 * Ignore white space within patterns unless quoted or escaped. 71 * @stable ICU 2.4 72 */ 73 USET_IGNORE_SPACE = 1, 74 75 /** 76 * Enable case insensitive matching. 
E.g., "[ab]" with this flag 77 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 78 * match all except 'a', 'A', 'b', and 'B'. This performs a full 79 * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. 80 * 81 * The resulting set is a superset of the input for the code points but 82 * not for the strings. 83 * It performs a case mapping closure of the code points and adds 84 * full case folding strings for the code points, and reduces strings of 85 * the original set to their full case folding equivalents. 86 * 87 * This is designed for case-insensitive matches, for example 88 * in regular expressions. The full code point case closure allows checking of 89 * an input character directly against the closure set. 90 * Strings are matched by comparing the case-folded form from the closure 91 * set with an incremental case folding of the string in question. 92 * 93 * The closure set will also contain single code points if the original 94 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 95 * This is not necessary (that is, redundant) for the above matching method 96 * but results in the same closure sets regardless of whether the original 97 * set contained the code point or a string. 98 * 99 * @stable ICU 2.4 100 */ 101 USET_CASE_INSENSITIVE = 2, 102 103 /** 104 * Adds all case mappings for each element in the set. 105 * This adds the full lower-, title-, and uppercase mappings as well as the full case folding 106 * of each existing element in the set. 107 * 108 * Unlike the “case insensitive” options, this does not perform a closure. 109 * For example, it does not add 'ſ' (U+017F long s) for 's', 110 * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. 111 * 112 * @stable ICU 3.2 113 */ 114 USET_ADD_CASE_MAPPINGS = 4, 115 116 /** 117 * Enable case insensitive matching. 
118 * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings, 119 * which map each code point to one code point, 120 * not full Case_Folding (cf) mappings, which map some code points to multiple code points. 121 * 122 * This is designed for case-insensitive matches, for example in certain 123 * regular expression implementations where only Simple_Case_Folding mappings are used, 124 * such as in ECMAScript (JavaScript) regular expressions. 125 * 126 * @stable ICU 73 127 */ 128 USET_SIMPLE_CASE_INSENSITIVE = 6 129 }; 130 131 /** 132 * Argument values for whether span() and similar functions continue while 133 * the current character is contained vs. not contained in the set. 134 * 135 * The functionality is straightforward for sets with only single code points, 136 * without strings (which is the common case): 137 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 138 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. 139 * - span() and spanBack() partition any string the same way when 140 * alternating between span(USET_SPAN_NOT_CONTAINED) and 141 * span(either "contained" condition). 142 * - Using a complemented (inverted) set and the opposite span conditions 143 * yields the same results. 144 * 145 * When a set contains multi-code point strings, then these statements may not 146 * be true, depending on the strings in the set (for example, whether they 147 * overlap with each other) and the string that is processed. 148 * For a set with strings: 149 * - The complement of the set contains the opposite set of code points, 150 * but the same set of strings. 151 * Therefore, complementing both the set and the span conditions 152 * may yield different results. 153 * - When starting spans at different positions in a string 154 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 155 * because a set string may start before the later position. 
156 * - span(USET_SPAN_SIMPLE) may be shorter than 157 * span(USET_SPAN_CONTAINED) because it will not recursively try 158 * all possible paths. 159 * For example, with a set which contains the three strings "xy", "xya" and "ax", 160 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 161 * span("xyax", USET_SPAN_SIMPLE) will return 3. 162 * span(USET_SPAN_SIMPLE) will never be longer than 163 * span(USET_SPAN_CONTAINED). 164 * - With either "contained" condition, span() and spanBack() may partition 165 * a string in different ways. 166 * For example, with a set which contains the two strings "ab" and "ba", 167 * and when processing the string "aba", 168 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 169 * while spanBack() will yield boundaries of { 0, 1, 3 }. 170 * 171 * Note: If it is important to get the same boundaries whether iterating forward 172 * or backward through a string, then either only span() should be used and 173 * the boundaries cached for backward operation, or an ICU BreakIterator 174 * could be used. 175 * 176 * Note: Unpaired surrogates are treated like surrogate code points. 177 * Similarly, set strings match only on code point boundaries, 178 * never in the middle of a surrogate pair. 179 * Illegal UTF-8 sequences are treated like U+FFFD. 180 * When processing UTF-8 strings, malformed set strings 181 * (strings with unpaired surrogates which cannot be converted to UTF-8) 182 * are ignored. 183 * 184 * @stable ICU 3.8 185 */ 186 typedef enum USetSpanCondition { 187 /** 188 * Continues a span() while there is no set element at the current position. 189 * Increments by one code point at a time. 190 * Stops before the first set element (character or string). 191 * (For code points only, this is like while contains(current)==false). 
192 * 193 * When span() returns, the substring between where it started and the position 194 * it returned consists only of characters that are not in the set, 195 * and none of its strings overlap with the span. 196 * 197 * @stable ICU 3.8 198 */ 199 USET_SPAN_NOT_CONTAINED = 0, 200 /** 201 * Spans the longest substring that is a concatenation of set elements (characters or strings). 202 * (For characters only, this is like while contains(current)==true). 203 * 204 * When span() returns, the substring between where it started and the position 205 * it returned consists only of set elements (characters or strings) that are in the set. 206 * 207 * If a set contains strings, then the span will be the longest substring for which there 208 * exists at least one non-overlapping concatenation of set elements (characters or strings). 209 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 210 * (Java/ICU/Perl regex stops at the first match of an OR.) 211 * 212 * @stable ICU 3.8 213 */ 214 USET_SPAN_CONTAINED = 1, 215 /** 216 * Continues a span() while there is a set element at the current position. 217 * Increments by the longest matching element at each position. 218 * (For characters only, this is like while contains(current)==true). 219 * 220 * When span() returns, the substring between where it started and the position 221 * it returned consists only of set elements (characters or strings) that are in the set. 222 * 223 * If a set only contains single characters, then this is the same 224 * as USET_SPAN_CONTAINED. 225 * 226 * If a set contains strings, then the span will be the longest substring 227 * with a match at each position with the longest single set element (character or string). 228 * 229 * Use this span condition together with other longest-match algorithms, 230 * such as ICU converters (ucnv_getUnicodeSet()). 
231 * 232 * @stable ICU 3.8 233 */ 234 USET_SPAN_SIMPLE = 2, 235 #ifndef U_HIDE_DEPRECATED_API 236 /** 237 * One more than the last span condition. 238 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 239 */ 240 USET_SPAN_CONDITION_COUNT 241 #endif // U_HIDE_DEPRECATED_API 242 } USetSpanCondition; 243 244 enum { 245 /** 246 * Capacity of USerializedSet::staticArray. 247 * Enough for any single-code point set. 248 * Also provides padding for nice sizeof(USerializedSet). 249 * @stable ICU 2.4 250 */ 251 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 252 }; 253 254 /** 255 * A serialized form of a Unicode set. Limited manipulations are 256 * possible directly on a serialized set. See below. 257 * @stable ICU 2.4 258 */ 259 typedef struct USerializedSet { 260 /** 261 * The serialized Unicode Set. 262 * @stable ICU 2.4 263 */ 264 const uint16_t *array; 265 /** 266 * The length of the array that contains BMP characters. 267 * @stable ICU 2.4 268 */ 269 int32_t bmpLength; 270 /** 271 * The total length of the array. 272 * @stable ICU 2.4 273 */ 274 int32_t length; 275 /** 276 * A small buffer for the array to reduce memory allocations. 277 * @stable ICU 2.4 278 */ 279 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 280 } USerializedSet; 281 282 /********************************************************************* 283 * USet API 284 *********************************************************************/ 285 286 /** 287 * Create an empty USet object. 288 * Equivalent to uset_open(1, 0). 289 * @return a newly created USet. The caller must call uset_close() on 290 * it when done. 291 * @stable ICU 4.2 292 */ 293 U_CAPI USet* U_EXPORT2 294 uset_openEmpty(void); 295 296 /** 297 * Creates a USet object that contains the range of characters 298 * start..end, inclusive. If <code>start > end</code> 299 * then an empty set is created (same as using uset_openEmpty()). 
300 * @param start first character of the range, inclusive 301 * @param end last character of the range, inclusive 302 * @return a newly created USet. The caller must call uset_close() on 303 * it when done. 304 * @stable ICU 2.4 305 */ 306 U_CAPI USet* U_EXPORT2 307 uset_open(UChar32 start, UChar32 end); 308 309 /** 310 * Creates a set from the given pattern. See the UnicodeSet class 311 * description for the syntax of the pattern language. 312 * @param pattern a string specifying what characters are in the set 313 * @param patternLength the length of the pattern, or -1 if null 314 * terminated 315 * @param ec the error code 316 * @stable ICU 2.4 317 */ 318 U_CAPI USet* U_EXPORT2 319 uset_openPattern(const UChar* pattern, int32_t patternLength, 320 UErrorCode* ec); 321 322 /** 323 * Creates a set from the given pattern. See the UnicodeSet class 324 * description for the syntax of the pattern language. 325 * @param pattern a string specifying what characters are in the set 326 * @param patternLength the length of the pattern, or -1 if null 327 * terminated 328 * @param options bitmask for options to apply to the pattern. 329 * Valid options are USET_IGNORE_SPACE and 330 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 331 * These case options are mutually exclusive. 332 * @param ec the error code 333 * @stable ICU 2.4 334 */ 335 U_CAPI USet* U_EXPORT2 336 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 337 uint32_t options, 338 UErrorCode* ec); 339 340 /** 341 * Disposes of the storage used by a USet object. This function should 342 * be called exactly once for each object returned by one of the uset_open*() functions. 343 * @param set the object to dispose of 344 * @stable ICU 2.4 345 */ 346 U_CAPI void U_EXPORT2 347 uset_close(USet* set); 348 349 #if U_SHOW_CPLUSPLUS_API 350 351 U_NAMESPACE_BEGIN 352 353 /** 354 * \class LocalUSetPointer 355 * "Smart pointer" class, closes a USet via uset_close(). 
356 * For most methods see the LocalPointerBase base class. 357 * 358 * @see LocalPointerBase 359 * @see LocalPointer 360 * @stable ICU 4.4 361 */ 362 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); 363 364 U_NAMESPACE_END 365 366 #endif 367 368 /** 369 * Returns a copy of this object. 370 * If this set is frozen, then the clone will be frozen as well. 371 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 372 * @param set the original set 373 * @return the newly allocated copy of the set 374 * @see uset_cloneAsThawed 375 * @stable ICU 3.8 376 */ 377 U_CAPI USet * U_EXPORT2 378 uset_clone(const USet *set); 379 380 /** 381 * Determines whether the set has been frozen (made immutable) or not. 382 * See the ICU4J Freezable interface for details. 383 * @param set the set 384 * @return true/false for whether the set has been frozen 385 * @see uset_freeze 386 * @see uset_cloneAsThawed 387 * @stable ICU 3.8 388 */ 389 U_CAPI UBool U_EXPORT2 390 uset_isFrozen(const USet *set); 391 392 /** 393 * Freeze the set (make it immutable). 394 * Once frozen, it cannot be unfrozen and is therefore thread-safe 395 * until it is deleted. 396 * See the ICU4J Freezable interface for details. 397 * Freezing the set may also make some operations faster, for example 398 * uset_contains() and uset_span(). 399 * A frozen set will not be modified. (It remains frozen.) 400 * @param set the set 401 * The function itself returns nothing; the set passed in is frozen in place. 402 * @see uset_isFrozen 403 * @see uset_cloneAsThawed 404 * @stable ICU 3.8 405 */ 406 U_CAPI void U_EXPORT2 407 uset_freeze(USet *set); 408 409 /** 410 * Clone the set and make the clone mutable. 411 * See the ICU4J Freezable interface for details. 
412 * @param set the set 413 * @return the mutable clone 414 * @see uset_freeze 415 * @see uset_isFrozen 416 * @see uset_clone 417 * @stable ICU 3.8 418 */ 419 U_CAPI USet * U_EXPORT2 420 uset_cloneAsThawed(const USet *set); 421 422 /** 423 * Causes the USet object to represent the range <code>start..end</code>, inclusive. 424 * If <code>start > end</code> then this USet is set to an empty range. 425 * A frozen set will not be modified. 426 * @param set the object to set to the given range 427 * @param start first character in the set, inclusive 428 * @param end last character in the set, inclusive 429 * @stable ICU 3.2 430 */ 431 U_CAPI void U_EXPORT2 432 uset_set(USet* set, 433 UChar32 start, UChar32 end); 434 435 /** 436 * Modifies the set to represent the set specified by the given 437 * pattern. See the UnicodeSet class description for the syntax of 438 * the pattern language. See also the User Guide chapter about UnicodeSet. 439 * <em>Empties the set passed before applying the pattern.</em> 440 * A frozen set will not be modified. 441 * @param set The set to which the pattern is to be applied. 442 * @param pattern A pointer to a UChar string specifying what characters are in the set. 443 * The character at pattern[0] must be a '['. 444 * @param patternLength The length of the UChar string. -1 if NUL terminated. 445 * @param options A bitmask for options to apply to the pattern. 446 * Valid options are USET_IGNORE_SPACE and 447 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, 448 * USET_SIMPLE_CASE_INSENSITIVE. 449 * These case options are mutually exclusive. 450 * @param status Returns an error if the pattern cannot be parsed. 451 * @return Upon successful parse, the value is 452 * the index of the character after the closing ']' 453 * of the parsed pattern. 454 * If the status code indicates failure, then the return value 455 * is the index of the error in the source. 
456 * 457 * @stable ICU 2.8 458 */ 459 U_CAPI int32_t U_EXPORT2 460 uset_applyPattern(USet *set, 461 const UChar *pattern, int32_t patternLength, 462 uint32_t options, 463 UErrorCode *status); 464 465 /** 466 * Modifies the set to contain those code points which have the given value 467 * for the given binary or enumerated property, as returned by 468 * u_getIntPropertyValue. Prior contents of this set are lost. 469 * A frozen set will not be modified. 470 * 471 * @param set the object to contain the code points defined by the property 472 * 473 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 474 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 475 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 476 * 477 * @param value a value in the range u_getIntPropertyMinValue(prop).. 478 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 479 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 480 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 481 * categories such as [:L:] to be represented. 482 * 483 * @param ec error code input/output parameter 484 * 485 * @stable ICU 3.2 486 */ 487 U_CAPI void U_EXPORT2 488 uset_applyIntPropertyValue(USet* set, 489 UProperty prop, int32_t value, UErrorCode* ec); 490 491 /** 492 * Modifies the set to contain those code points which have the 493 * given value for the given property. Prior contents of this 494 * set are lost. 495 * A frozen set will not be modified. 496 * 497 * @param set the object to contain the code points defined by the given 498 * property and value alias 499 * 500 * @param prop a string specifying a property alias, either short or long. 501 * The name is matched loosely. See PropertyAliases.txt for names and a 502 * description of loose matching. If the value string is empty, then this 503 * string is interpreted as either a General_Category value alias, a Script 504 * value alias, a binary property alias, or a special ID. 
Special IDs are 505 * matched loosely and correspond to the following sets: 506 * 507 * "ANY" = [\\u0000-\\U0010FFFF], 508 * "ASCII" = [\\u0000-\\u007F], 509 * "Assigned" = [:^Cn:]. 510 * 511 * @param propLength the length of the prop, or -1 if NUL-terminated 512 * 513 * @param value a string specifying a value alias, either short or long. 514 * The name is matched loosely. See PropertyValueAliases.txt for names 515 * and a description of loose matching. In addition to aliases listed, 516 * numeric values and canonical combining classes may be expressed 517 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 518 * may also be empty. 519 * 520 * @param valueLength the length of the value, or -1 if NUL-terminated 521 * 522 * @param ec error code input/output parameter 523 * 524 * @stable ICU 3.2 525 */ 526 U_CAPI void U_EXPORT2 527 uset_applyPropertyAlias(USet* set, 528 const UChar *prop, int32_t propLength, 529 const UChar *value, int32_t valueLength, 530 UErrorCode* ec); 531 532 /** 533 * Return true if the given position, in the given pattern, appears 534 * to be the start of a UnicodeSet pattern. 535 * 536 * @param pattern a string specifying the pattern 537 * @param patternLength the length of the pattern, or -1 if NUL-terminated 538 * @param pos the given position 539 * @stable ICU 3.2 540 */ 541 U_CAPI UBool U_EXPORT2 542 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 543 int32_t pos); 544 545 /** 546 * Returns a string representation of this set. If the result of 547 * calling this function is passed to uset_openPattern(), it 548 * will produce another set that is equal to this one. 549 * @param set the set 550 * @param result the string to receive the rules, may be NULL 551 * @param resultCapacity the capacity of result, may be 0 if result is NULL 552 * @param escapeUnprintable if true then convert unprintable 553 * characters to their hex escape representations, \\uxxxx or 554 * \\Uxxxxxxxx. 
Unprintable characters are those other than 555 * U+000A, U+0020..U+007E. 556 * @param ec error code. 557 * @return length of string, possibly larger than resultCapacity 558 * @stable ICU 2.4 559 */ 560 U_CAPI int32_t U_EXPORT2 561 uset_toPattern(const USet* set, 562 UChar* result, int32_t resultCapacity, 563 UBool escapeUnprintable, 564 UErrorCode* ec); 565 566 /** 567 * Adds the given character to the given USet. After this call, 568 * uset_contains(set, c) will return true. 569 * A frozen set will not be modified. 570 * @param set the object to which to add the character 571 * @param c the character to add 572 * @stable ICU 2.4 573 */ 574 U_CAPI void U_EXPORT2 575 uset_add(USet* set, UChar32 c); 576 577 /** 578 * Adds all of the elements in the specified set to this set if 579 * they're not already present. This operation effectively 580 * modifies this set so that its value is the <i>union</i> of the two 581 * sets. The behavior of this operation is unspecified if the specified 582 * collection is modified while the operation is in progress. 583 * A frozen set will not be modified. 584 * 585 * @param set the object to which to add the set 586 * @param additionalSet the source set whose elements are to be added to this set. 587 * @stable ICU 2.6 588 */ 589 U_CAPI void U_EXPORT2 590 uset_addAll(USet* set, const USet *additionalSet); 591 592 /** 593 * Adds the given range of characters to the given USet. After this call, 594 * uset_contains(set, start, end) will return true. 595 * A frozen set will not be modified. 596 * @param set the object to which to add the character 597 * @param start the first character of the range to add, inclusive 598 * @param end the last character of the range to add, inclusive 599 * @stable ICU 2.2 600 */ 601 U_CAPI void U_EXPORT2 602 uset_addRange(USet* set, UChar32 start, UChar32 end); 603 604 /** 605 * Adds the given string to the given USet. After this call, 606 * uset_containsString(set, str, strLen) will return true. 
607 * A frozen set will not be modified. 608 * @param set the object to which to add the character 609 * @param str the string to add 610 * @param strLen the length of the string or -1 if null terminated. 611 * @stable ICU 2.4 612 */ 613 U_CAPI void U_EXPORT2 614 uset_addString(USet* set, const UChar* str, int32_t strLen); 615 616 /** 617 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} 618 * If this set already contains any particular character, it has no effect on that character. 619 * A frozen set will not be modified. 620 * @param set the object to which to add the character 621 * @param str the source string 622 * @param strLen the length of the string or -1 if null terminated. 623 * @stable ICU 3.4 624 */ 625 U_CAPI void U_EXPORT2 626 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 627 628 /** 629 * Removes the given character from the given USet. After this call, 630 * uset_contains(set, c) will return false. 631 * A frozen set will not be modified. 632 * @param set the object from which to remove the character 633 * @param c the character to remove 634 * @stable ICU 2.4 635 */ 636 U_CAPI void U_EXPORT2 637 uset_remove(USet* set, UChar32 c); 638 639 /** 640 * Removes the given range of characters from the given USet. After this call, 641 * uset_contains(set, start, end) will return false. 642 * A frozen set will not be modified. 643 * @param set the object to which to add the character 644 * @param start the first character of the range to remove, inclusive 645 * @param end the last character of the range to remove, inclusive 646 * @stable ICU 2.2 647 */ 648 U_CAPI void U_EXPORT2 649 uset_removeRange(USet* set, UChar32 start, UChar32 end); 650 651 /** 652 * Removes the given string to the given USet. After this call, 653 * uset_containsString(set, str, strLen) will return false. 654 * A frozen set will not be modified. 
655 * @param set the object to which to add the character 656 * @param str the string to remove 657 * @param strLen the length of the string or -1 if null terminated. 658 * @stable ICU 2.4 659 */ 660 U_CAPI void U_EXPORT2 661 uset_removeString(USet* set, const UChar* str, int32_t strLen); 662 663 /** 664 * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} 665 * A frozen set will not be modified. 666 * 667 * @param set the object to be modified 668 * @param str the string 669 * @param length the length of the string, or -1 if NUL-terminated 670 * @stable ICU 69 671 */ 672 U_CAPI void U_EXPORT2 673 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); 674 675 /** 676 * Removes from this set all of its elements that are contained in the 677 * specified set. This operation effectively modifies this 678 * set so that its value is the <i>asymmetric set difference</i> of 679 * the two sets. 680 * A frozen set will not be modified. 681 * @param set the object from which the elements are to be removed 682 * @param removeSet the object that defines which elements will be 683 * removed from this set 684 * @stable ICU 3.2 685 */ 686 U_CAPI void U_EXPORT2 687 uset_removeAll(USet* set, const USet* removeSet); 688 689 /** 690 * Retain only the elements in this set that are contained in the 691 * specified range. If <code>start > end</code> then an empty range is 692 * retained, leaving the set empty. This is equivalent to 693 * a boolean logic AND, or a set INTERSECTION. 694 * A frozen set will not be modified. 695 * 696 * @param set the object for which to retain only the specified range 697 * @param start first character, inclusive, of range 698 * @param end last character, inclusive, of range 699 * @stable ICU 3.2 700 */ 701 U_CAPI void U_EXPORT2 702 uset_retain(USet* set, UChar32 start, UChar32 end); 703 704 /** 705 * Retains only the specified string from this set if it is present. 
706 * Upon return this set will be empty if it did not contain s, or 707 * will only contain s if it did contain s. 708 * A frozen set will not be modified. 709 * 710 * @param set the object to be modified 711 * @param str the string 712 * @param length the length of the string, or -1 if NUL-terminated 713 * @stable ICU 69 714 */ 715 U_CAPI void U_EXPORT2 716 uset_retainString(USet *set, const UChar *str, int32_t length); 717 718 /** 719 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 720 * A frozen set will not be modified. 721 * 722 * @param set the object to be modified 723 * @param str the string 724 * @param length the length of the string, or -1 if NUL-terminated 725 * @stable ICU 69 726 */ 727 U_CAPI void U_EXPORT2 728 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); 729 730 /** 731 * Retains only the elements in this set that are contained in the 732 * specified set. In other words, removes from this set all of 733 * its elements that are not contained in the specified set. This 734 * operation effectively modifies this set so that its value is 735 * the <i>intersection</i> of the two sets. 736 * A frozen set will not be modified. 737 * 738 * @param set the object on which to perform the retain 739 * @param retain set that defines which elements this set will retain 740 * @stable ICU 3.2 741 */ 742 U_CAPI void U_EXPORT2 743 uset_retainAll(USet* set, const USet* retain); 744 745 /** 746 * Reallocate this object's internal structures to take up the least 747 * possible space, without changing this object's value. 748 * A frozen set will not be modified. 749 * 750 * @param set the object on which to perform the compact 751 * @stable ICU 3.2 752 */ 753 U_CAPI void U_EXPORT2 754 uset_compact(USet* set); 755 756 /** 757 * This is equivalent to 758 * <code>uset_complementRange(set, 0, 0x10FFFF)</code>. 
759 * 760 * <strong>Note:</strong> This performs a symmetric difference with all code points 761 * <em>and thus retains all multicharacter strings</em>. 762 * In order to achieve a “code point complement” (all code points minus this set), 763 * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>. 764 * 765 * A frozen set will not be modified. 766 * @param set the set 767 * @stable ICU 2.4 768 */ 769 U_CAPI void U_EXPORT2 770 uset_complement(USet* set); 771 772 /** 773 * Complements the specified range in this set. Any character in 774 * the range will be removed if it is in this set, or will be 775 * added if it is not in this set. If <code>start > end</code> 776 * then an empty range is complemented, leaving the set unchanged. 777 * This is equivalent to a boolean logic XOR. 778 * A frozen set will not be modified. 779 * 780 * @param set the object to be modified 781 * @param start first character, inclusive, of range 782 * @param end last character, inclusive, of range 783 * @stable ICU 69 784 */ 785 U_CAPI void U_EXPORT2 786 uset_complementRange(USet *set, UChar32 start, UChar32 end); 787 788 /** 789 * Complements the specified string in this set. 790 * The string will be removed if it is in this set, or will be added if it is not in this set. 791 * A frozen set will not be modified. 792 * 793 * @param set the object to be modified 794 * @param str the string 795 * @param length the length of the string, or -1 if NUL-terminated 796 * @stable ICU 69 797 */ 798 U_CAPI void U_EXPORT2 799 uset_complementString(USet *set, const UChar *str, int32_t length); 800 801 /** 802 * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"} 803 * A frozen set will not be modified. 
804 * 805 * @param set the object to be modified 806 * @param str the string 807 * @param length the length of the string, or -1 if NUL-terminated 808 * @stable ICU 69 809 */ 810 U_CAPI void U_EXPORT2 811 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); 812 813 /** 814 * Complements in this set all elements contained in the specified 815 * set. Any character in the other set will be removed if it is 816 * in this set, or will be added if it is not in this set. 817 * A frozen set will not be modified. 818 * 819 * @param set the set with which to complement 820 * @param complement set that defines which elements will be xor'ed 821 * from this set. 822 * @stable ICU 3.2 823 */ 824 U_CAPI void U_EXPORT2 825 uset_complementAll(USet* set, const USet* complement); 826 827 /** 828 * Removes all of the elements from this set. This set will be 829 * empty after this call returns. 830 * A frozen set will not be modified. 831 * @param set the set 832 * @stable ICU 2.4 833 */ 834 U_CAPI void U_EXPORT2 835 uset_clear(USet* set); 836 837 /** 838 * Close this set over the given attribute. For the attribute 839 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 840 * 841 * 1. For each character or string 'a' in this set, all strings or 842 * characters 'b' such that foldCase(a) == foldCase(b) are added 843 * to this set. 844 * 845 * 2. For each string 'e' in the resulting set, if e != 846 * foldCase(e), 'e' will be removed. 847 * 848 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 849 * 850 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 851 * == b denotes that the contents are the same, not pointer 852 * comparison.) 853 * 854 * A frozen set will not be modified. 855 * 856 * @param set the set 857 * 858 * @param attributes bitmask for attributes to close over. 859 * Valid options: 860 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 
861 * These case options are mutually exclusive. 862 * Unrelated options bits are ignored. 863 * @stable ICU 4.2 864 */ 865 U_CAPI void U_EXPORT2 866 uset_closeOver(USet* set, int32_t attributes); 867 868 /** 869 * Remove all strings from this set. 870 * 871 * @param set the set 872 * @stable ICU 4.2 873 */ 874 U_CAPI void U_EXPORT2 875 uset_removeAllStrings(USet* set); 876 877 /** 878 * Returns true if the given USet contains no characters and no 879 * strings. 880 * @param set the set 881 * @return true if set is empty 882 * @stable ICU 2.4 883 */ 884 U_CAPI UBool U_EXPORT2 885 uset_isEmpty(const USet* set); 886 887 /** 888 * @param set the set 889 * @return true if this set contains multi-character strings or the empty string. 890 * @stable ICU 70 891 */ 892 U_CAPI UBool U_EXPORT2 893 uset_hasStrings(const USet *set); 894 895 /** 896 * Returns true if the given USet contains the given character. 897 * This function works faster with a frozen set. 898 * @param set the set 899 * @param c The codepoint to check for within the set 900 * @return true if set contains c 901 * @stable ICU 2.4 902 */ 903 U_CAPI UBool U_EXPORT2 904 uset_contains(const USet* set, UChar32 c); 905 906 /** 907 * Returns true if the given USet contains all characters c 908 * where start <= c && c <= end. 909 * @param set the set 910 * @param start the first character of the range to test, inclusive 911 * @param end the last character of the range to test, inclusive 912 * @return true if set contains the range 913 * @stable ICU 2.2 914 */ 915 U_CAPI UBool U_EXPORT2 916 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 917 918 /** 919 * Returns true if the given USet contains the given string. 920 * @param set the set 921 * @param str the string 922 * @param strLen the length of the string or -1 if null terminated. 
923 * @return true if set contains str 924 * @stable ICU 2.4 925 */ 926 U_CAPI UBool U_EXPORT2 927 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 928 929 /** 930 * Returns the index of the given character within this set, where 931 * the set is ordered by ascending code point. If the character 932 * is not in this set, return -1. The inverse of this method is 933 * <code>charAt()</code>. 934 * @param set the set 935 * @param c the character to obtain the index for 936 * @return an index from 0..size()-1, or -1 937 * @stable ICU 3.2 938 */ 939 U_CAPI int32_t U_EXPORT2 940 uset_indexOf(const USet* set, UChar32 c); 941 942 /** 943 * Returns the character at the given index within this set, where 944 * the set is ordered by ascending code point. If the index is 945 * out of range for characters, returns (UChar32)-1. 946 * The inverse of this method is <code>indexOf()</code>. 947 * 948 * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount() 949 * with uset_getItem(), because for each call it skips linearly over <code>index</code> 950 * characters in the ranges. 951 * 952 * @param set the set 953 * @param charIndex an index from 0..size()-1 to obtain the char for 954 * @return the character at the given index, or (UChar32)-1. 955 * @stable ICU 3.2 956 */ 957 U_CAPI UChar32 U_EXPORT2 958 uset_charAt(const USet* set, int32_t charIndex); 959 960 /** 961 * Returns the number of characters and strings contained in this set. 962 * The last uset_getStringCount() == (uset_getItemCount() - uset_getRangeCount()) items are strings. 963 * 964 * This is slower than uset_getRangeCount() and uset_getItemCount() because 965 * it counts the code points of all ranges. 
966 * 967 * @param set the set 968 * @return a non-negative integer counting the characters and strings 969 * contained in set 970 * @stable ICU 2.4 971 * @see uset_getRangeCount 972 * @see uset_getStringCount 973 * @see uset_getItemCount 974 */ 975 U_CAPI int32_t U_EXPORT2 976 uset_size(const USet* set); 977 978 /** 979 * @param set the set 980 * @return the number of ranges in this set. 981 * @stable ICU 70 982 * @see uset_getItemCount 983 * @see uset_getItem 984 * @see uset_getStringCount 985 * @see uset_size 986 */ 987 U_CAPI int32_t U_EXPORT2 988 uset_getRangeCount(const USet *set); 989 990 /** 991 * @param set the set 992 * @return the number of strings in this set. 993 * @stable ICU 76 994 * @see uset_getRangeCount 995 * @see uset_getItemCount 996 * @see uset_size 997 */ 998 U_CAPI int32_t U_EXPORT2 999 uset_getStringCount(const USet *set); 1000 1001 /** 1002 * Returns the index-th string (empty or multi-character) in the set. 1003 * The string may not be NUL-terminated. 1004 * The output length must be used, and the caller must not read more than that many UChars. 1005 * 1006 * @param set the set 1007 * @param index the string index, 0 .. uset_getStringCount() - 1 1008 * @param pLength the output string length; must not be NULL 1009 * @return the pointer to the string; NULL if the index is out of range or pLength is NULL 1010 * @stable ICU 76 1011 * @see uset_getStringCount 1012 */ 1013 U_CAPI const UChar* U_EXPORT2 1014 uset_getString(const USet *set, int32_t index, int32_t *pLength); 1015 1016 /** 1017 * Returns the number of items in this set. An item is either a range 1018 * of characters or a single multicharacter string. 
1019 * @param set the set 1020 * @return a non-negative integer counting the character ranges 1021 * and/or strings contained in set 1022 * @stable ICU 2.4 1023 * @see uset_getRangeCount 1024 * @see uset_getStringCount 1025 */ 1026 U_CAPI int32_t U_EXPORT2 1027 uset_getItemCount(const USet* set); 1028 1029 /** 1030 * Returns an item of this set. An item is either a range of 1031 * characters or a single multicharacter string (which can be the empty string). 1032 * 1033 * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0, 1034 * and the range is <code>*start</code>..<code>*end</code>. 1035 * 1036 * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then 1037 * this function copies the string into <code>str[strCapacity]</code> and 1038 * returns the length of the string (0 for the empty string). 1039 * See uset_getString() for a function that does not copy the string contents. 1040 * 1041 * If <code>itemIndex</code> is out of range, then this function returns -1. 1042 * 1043 * Note that 0 is returned for each range as well as for the empty string. 
1044 * 1045 * @param set the set 1046 * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1 1047 * @param start pointer to variable to receive first character in range, inclusive; 1048 * can be NULL for a string item 1049 * @param end pointer to variable to receive last character in range, inclusive; 1050 * can be NULL for a string item 1051 * @param str buffer to receive the string, may be NULL 1052 * @param strCapacity capacity of str, or 0 if str is NULL 1053 * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range 1054 * @return the length of the string (0 or >= 2), or 0 if the item is a range, 1055 * or -1 if the itemIndex is out of range 1056 * @stable ICU 2.4 1057 * @see uset_getString 1058 */ 1059 U_CAPI int32_t U_EXPORT2 1060 uset_getItem(const USet* set, int32_t itemIndex, 1061 UChar32* start, UChar32* end, 1062 UChar* str, int32_t strCapacity, 1063 UErrorCode* ec); 1064 1065 /** 1066 * Returns true if set1 contains all the characters and strings 1067 * of set2. It answers the question, 'Is set1 a superset of set2?' 1068 * @param set1 set to be checked for containment 1069 * @param set2 set to be checked for containment 1070 * @return true if the test condition is met 1071 * @stable ICU 3.2 1072 */ 1073 U_CAPI UBool U_EXPORT2 1074 uset_containsAll(const USet* set1, const USet* set2); 1075 1076 /** 1077 * Returns true if this set contains all the characters 1078 * of the given string. This does not check containment of grapheme 1079 * clusters, like uset_containsString. 1080 * @param set set of characters to be checked for containment 1081 * @param str string containing codepoints to be checked for containment 1082 * @param strLen the length of the string or -1 if null terminated.
1083 * @return true if the test condition is met 1084 * @stable ICU 3.4 1085 */ 1086 U_CAPI UBool U_EXPORT2 1087 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 1088 1089 /** 1090 * Returns true if set1 contains none of the characters and strings 1091 * of set2. It answers the question, 'Is set1 a disjoint set of set2?' 1092 * @param set1 set to be checked for containment 1093 * @param set2 set to be checked for containment 1094 * @return true if the test condition is met 1095 * @stable ICU 3.2 1096 */ 1097 U_CAPI UBool U_EXPORT2 1098 uset_containsNone(const USet* set1, const USet* set2); 1099 1100 /** 1101 * Returns true if set1 contains some of the characters and strings 1102 * of set2. It answers the question, 'Do set1 and set2 have an intersection?' 1103 * @param set1 set to be checked for containment 1104 * @param set2 set to be checked for containment 1105 * @return true if the test condition is met 1106 * @stable ICU 3.2 1107 */ 1108 U_CAPI UBool U_EXPORT2 1109 uset_containsSome(const USet* set1, const USet* set2); 1110 1111 /** 1112 * Returns the length of the initial substring of the input string which 1113 * consists only of characters and strings that are contained in this set 1114 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1115 * or only of characters and strings that are not contained 1116 * in this set (USET_SPAN_NOT_CONTAINED). 1117 * See USetSpanCondition for details. 1118 * Similar to the strspn() C library function. 1119 * Unpaired surrogates are treated according to contains() of their surrogate code points. 1120 * This function works faster with a frozen set and with a non-negative string length argument.
1121 * @param set the set 1122 * @param s start of the string 1123 * @param length of the string; can be -1 for NUL-terminated 1124 * @param spanCondition specifies the containment condition 1125 * @return the length of the initial substring according to the spanCondition; 1126 * 0 if the start of the string does not fit the spanCondition 1127 * @stable ICU 3.8 1128 * @see USetSpanCondition 1129 */ 1130 U_CAPI int32_t U_EXPORT2 1131 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1132 1133 /** 1134 * Returns the start of the trailing substring of the input string which 1135 * consists only of characters and strings that are contained in this set 1136 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1137 * or only of characters and strings that are not contained 1138 * in this set (USET_SPAN_NOT_CONTAINED). 1139 * See USetSpanCondition for details. 1140 * Unpaired surrogates are treated according to contains() of their surrogate code points. 1141 * This function works faster with a frozen set and with a non-negative string length argument. 1142 * @param set the set 1143 * @param s start of the string 1144 * @param length of the string; can be -1 for NUL-terminated 1145 * @param spanCondition specifies the containment condition 1146 * @return the start of the trailing substring according to the spanCondition; 1147 * the string length if the end of the string does not fit the spanCondition 1148 * @stable ICU 3.8 1149 * @see USetSpanCondition 1150 */ 1151 U_CAPI int32_t U_EXPORT2 1152 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1153 1154 /** 1155 * Returns the length of the initial substring of the input string which 1156 * consists only of characters and strings that are contained in this set 1157 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1158 * or only of characters and strings that are not contained 1159 * in this set (USET_SPAN_NOT_CONTAINED). 
1160 * See USetSpanCondition for details. 1161 * Similar to the strspn() C library function. 1162 * Malformed byte sequences are treated according to contains(0xfffd). 1163 * This function works faster with a frozen set and with a non-negative string length argument. 1164 * @param set the set 1165 * @param s start of the string (UTF-8) 1166 * @param length of the string; can be -1 for NUL-terminated 1167 * @param spanCondition specifies the containment condition 1168 * @return the length of the initial substring according to the spanCondition; 1169 * 0 if the start of the string does not fit the spanCondition 1170 * @stable ICU 3.8 1171 * @see USetSpanCondition 1172 */ 1173 U_CAPI int32_t U_EXPORT2 1174 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1175 1176 /** 1177 * Returns the start of the trailing substring of the input string which 1178 * consists only of characters and strings that are contained in this set 1179 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1180 * or only of characters and strings that are not contained 1181 * in this set (USET_SPAN_NOT_CONTAINED). 1182 * See USetSpanCondition for details. 1183 * Malformed byte sequences are treated according to contains(0xfffd). 1184 * This function works faster with a frozen set and with a non-negative string length argument. 
1185 * @param set the set 1186 * @param s start of the string (UTF-8) 1187 * @param length of the string; can be -1 for NUL-terminated 1188 * @param spanCondition specifies the containment condition 1189 * @return the start of the trailing substring according to the spanCondition; 1190 * the string length if the end of the string does not fit the spanCondition 1191 * @stable ICU 3.8 1192 * @see USetSpanCondition 1193 */ 1194 U_CAPI int32_t U_EXPORT2 1195 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1196 1197 /** 1198 * Returns true if set1 contains all of the characters and strings 1199 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 1200 * @param set1 set to be checked for containment 1201 * @param set2 set to be checked for containment 1202 * @return true if the test condition is met 1203 * @stable ICU 3.2 1204 */ 1205 U_CAPI UBool U_EXPORT2 1206 uset_equals(const USet* set1, const USet* set2); 1207 1208 /********************************************************************* 1209 * Serialized set API 1210 *********************************************************************/ 1211 1212 /** 1213 * Serializes this set into an array of 16-bit integers. Serialization 1214 * (currently) only records the characters in the set; multicharacter 1215 * strings are ignored. 1216 * 1217 * The array 1218 * has the following format (each line is one 16-bit integer): 1219 * 1220 * length = (n+2*m) | (m!=0?0x8000:0) 1221 * bmpLength = n; present if m!=0 1222 * bmp[0] 1223 * bmp[1] 1224 * ... 1225 * bmp[n-1] 1226 * supp-high[0] 1227 * supp-low[0] 1228 * supp-high[1] 1229 * supp-low[1] 1230 * ... 1231 * supp-high[m-1] 1232 * supp-low[m-1] 1233 * 1234 * The array starts with a header. After the header are n bmp 1235 * code points, then m supplementary code points. Either n or m 1236 * or both may be zero. n+2*m is always <= 0x7FFF.
1237 * 1238 * If there are no supplementary characters (if m==0) then the 1239 * header is one 16-bit integer, 'length', with value n. 1240 * 1241 * If there are supplementary characters (if m!=0) then the header 1242 * is two 16-bit integers. The first, 'length', has value 1243 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1244 * 1245 * After the header the code points are stored in ascending order. 1246 * Supplementary code points are stored as most significant 16 1247 * bits followed by least significant 16 bits. 1248 * 1249 * @param set the set 1250 * @param dest pointer to buffer of destCapacity 16-bit integers. 1251 * May be NULL only if destCapacity is zero. 1252 * @param destCapacity size of dest, or zero. Must not be negative. 1253 * @param pErrorCode pointer to the error code. Will be set to 1254 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1255 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1256 * @return the total length of the serialized format, including 1257 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1258 * than U_BUFFER_OVERFLOW_ERROR. 1259 * @stable ICU 2.4 1260 */ 1261 U_CAPI int32_t U_EXPORT2 1262 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1263 1264 /** 1265 * Given a serialized array, fill in the given serialized set object. 1266 * @param fillSet pointer to result 1267 * @param src pointer to start of array 1268 * @param srcLength length of array 1269 * @return true if the given array is valid, otherwise false 1270 * @stable ICU 2.4 1271 */ 1272 U_CAPI UBool U_EXPORT2 1273 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1274 1275 /** 1276 * Set the USerializedSet to contain the given character (and nothing 1277 * else). 
1278 * @param fillSet pointer to result 1279 * @param c The codepoint to set 1280 * @stable ICU 2.4 1281 */ 1282 U_CAPI void U_EXPORT2 1283 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1284 1285 /** 1286 * Returns true if the given USerializedSet contains the given 1287 * character. 1288 * @param set the serialized set 1289 * @param c The codepoint to check for within the set 1290 * @return true if set contains c 1291 * @stable ICU 2.4 1292 */ 1293 U_CAPI UBool U_EXPORT2 1294 uset_serializedContains(const USerializedSet* set, UChar32 c); 1295 1296 /** 1297 * Returns the number of disjoint ranges of characters contained in 1298 * the given serialized set. Ignores any strings contained in the 1299 * set. 1300 * @param set the serialized set 1301 * @return a non-negative integer counting the character ranges 1302 * contained in set 1303 * @stable ICU 2.4 1304 */ 1305 U_CAPI int32_t U_EXPORT2 1306 uset_getSerializedRangeCount(const USerializedSet* set); 1307 1308 /** 1309 * Returns a range of characters contained in the given serialized 1310 * set. 1311 * @param set the serialized set 1312 * @param rangeIndex a non-negative integer in the range 0.. 1313 * uset_getSerializedRangeCount(set)-1 1314 * @param pStart pointer to variable to receive first character 1315 * in range, inclusive 1316 * @param pEnd pointer to variable to receive last character in range, 1317 * inclusive 1318 * @return true if rangeIndex is valid, otherwise false 1319 * @stable ICU 2.4 1320 */ 1321 U_CAPI UBool U_EXPORT2 1322 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1323 UChar32* pStart, UChar32* pEnd); 1324 1325 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API 1326 1327 namespace U_HEADER_ONLY_NAMESPACE { 1328 1329 // Note: Not U_COMMON_API, and not a subclass of UMemory, because this is a header-only class, 1330 // not intended to be used via export from the ICU DLL. 1331 1332 /** 1333 * Iterator returned by USetCodePoints. 
1334 * @stable ICU 76 1335 */ 1336 class USetCodePointIterator { 1337 public: 1338 /** @stable ICU 76 */ 1339 USetCodePointIterator(const USetCodePointIterator &other) = default; 1340 1341 /** @stable ICU 76 */ 1342 bool operator==(const USetCodePointIterator &other) const { 1343 // No need to compare rangeCount & end given private constructor 1344 // and assuming we don't compare iterators across the set being modified. 1345 // And comparing rangeIndex is redundant with comparing c. 1346 // We might even skip comparing uset. 1347 // Unless we want operator==() to be "correct" for more than iteration. 1348 return uset == other.uset && c == other.c; 1349 } 1350 1351 /** @stable ICU 76 */ 1352 bool operator!=(const USetCodePointIterator &other) const { return !operator==(other); } 1353 1354 /** @stable ICU 76 */ 1355 UChar32 operator*() const { return c; } 1356 1357 /** 1358 * Pre-increment. 1359 * @stable ICU 76 1360 */ 1361 USetCodePointIterator &operator++() { 1362 if (c < end) { 1363 ++c; 1364 } else if (rangeIndex < rangeCount) { 1365 UErrorCode errorCode = U_ZERO_ERROR; 1366 int32_t result = uset_getItem(uset, rangeIndex, &c, &end, nullptr, 0, &errorCode); 1367 if (U_SUCCESS(errorCode) && result == 0) { 1368 ++rangeIndex; 1369 } else { 1370 c = end = U_SENTINEL; 1371 } 1372 } else { 1373 c = end = U_SENTINEL; 1374 } 1375 return *this; 1376 } 1377 1378 /** 1379 * Post-increment. 1380 * @stable ICU 76 1381 */ 1382 USetCodePointIterator operator++(int) { 1383 USetCodePointIterator result(*this); 1384 operator++(); 1385 return result; 1386 } 1387 1388 private: 1389 friend class USetCodePoints; 1390 1391 USetCodePointIterator(const USet *pUset, int32_t nRangeIndex, int32_t nRangeCount) 1392 : uset(pUset), rangeIndex(nRangeIndex), rangeCount(nRangeCount), 1393 c(U_SENTINEL), end(U_SENTINEL) { 1394 // Fetch the first range. 
1395 operator++(); 1396 } 1397 1398 const USet *uset; 1399 int32_t rangeIndex; 1400 int32_t rangeCount; 1401 UChar32 c, end; 1402 }; 1403 1404 /** 1405 * C++ "range" for iterating over the code points of a USet. 1406 * 1407 * \code 1408 * using U_HEADER_NESTED_NAMESPACE::USetCodePoints; 1409 * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode)); 1410 * for (UChar32 c : USetCodePoints(uset.getAlias())) { 1411 * printf("uset.codePoint U+%04lx\n", (long)c); 1412 * } 1413 * \endcode 1414 * 1415 * C++ UnicodeSet has member functions for iteration, including codePoints(). 1416 * 1417 * @stable ICU 76 1418 * @see USetRanges 1419 * @see USetStrings 1420 * @see USetElements 1421 */ 1422 class USetCodePoints { 1423 public: 1424 /** 1425 * Constructs a C++ "range" object over the code points of the USet. 1426 * @stable ICU 76 1427 */ 1428 USetCodePoints(const USet *pUset) : uset(pUset), rangeCount(uset_getRangeCount(pUset)) {} 1429 1430 /** @stable ICU 76 */ 1431 USetCodePoints(const USetCodePoints &other) = default; 1432 1433 /** @stable ICU 76 */ 1434 USetCodePointIterator begin() const { 1435 return USetCodePointIterator(uset, 0, rangeCount); 1436 } 1437 1438 /** @stable ICU 76 */ 1439 USetCodePointIterator end() const { 1440 return USetCodePointIterator(uset, rangeCount, rangeCount); 1441 } 1442 1443 private: 1444 const USet *uset; 1445 int32_t rangeCount; 1446 }; 1447 1448 /** 1449 * A contiguous range of code points in a USet/UnicodeSet. 1450 * Returned by USetRangeIterator which is returned by USetRanges. 1451 * Both the rangeStart and rangeEnd are in the range. 1452 * (end() returns an iterator corresponding to rangeEnd+1.) 
1453 * @stable ICU 76 1454 */ 1455 struct CodePointRange { 1456 /** @stable ICU 76 */ 1457 struct iterator { 1458 /** @stable ICU 76 */ 1459 iterator(UChar32 aC) : c(aC) {} 1460 1461 /** @stable ICU 76 */ 1462 bool operator==(const iterator &other) const { return c == other.c; } 1463 /** @stable ICU 76 */ 1464 bool operator!=(const iterator &other) const { return !operator==(other); } 1465 1466 /** @stable ICU 76 */ 1467 UChar32 operator*() const { return c; } 1468 1469 /** 1470 * Pre-increment. 1471 * @stable ICU 76 1472 */ 1473 iterator &operator++() { 1474 ++c; 1475 return *this; 1476 } 1477 1478 /** 1479 * Post-increment. 1480 * @stable ICU 76 1481 */ 1482 iterator operator++(int) { 1483 return c++; 1484 } 1485 1486 /** 1487 * The current code point in the range. 1488 * @stable ICU 76 1489 */ 1490 UChar32 c; 1491 }; 1492 1493 /** @stable ICU 76 */ 1494 CodePointRange(UChar32 start, UChar32 end) : rangeStart(start), rangeEnd(end) {} 1495 /** @stable ICU 76 */ 1496 CodePointRange(const CodePointRange &other) = default; 1497 /** @stable ICU 76 */ 1498 size_t size() const { return (rangeEnd + 1) - rangeStart; } 1499 /** @stable ICU 76 */ 1500 iterator begin() const { return rangeStart; } 1501 /** @stable ICU 76 */ 1502 iterator end() const { return rangeEnd + 1; } 1503 1504 /** 1505 * Start of a USet/UnicodeSet range of code points. 1506 * @stable ICU 76 1507 */ 1508 UChar32 rangeStart; 1509 /** 1510 * Inclusive end of a USet/UnicodeSet range of code points. 1511 * @stable ICU 76 1512 */ 1513 UChar32 rangeEnd; 1514 }; 1515 1516 /** 1517 * Iterator returned by USetRanges. 
1518 * @stable ICU 76 1519 */ 1520 class USetRangeIterator { 1521 public: 1522 /** @stable ICU 76 */ 1523 USetRangeIterator(const USetRangeIterator &other) = default; 1524 1525 /** @stable ICU 76 */ 1526 bool operator==(const USetRangeIterator &other) const { 1527 // No need to compare rangeCount given private constructor 1528 // and assuming we don't compare iterators across the set being modified. 1529 // We might even skip comparing uset. 1530 // Unless we want operator==() to be "correct" for more than iteration. 1531 return uset == other.uset && rangeIndex == other.rangeIndex; 1532 } 1533 1534 /** @stable ICU 76 */ 1535 bool operator!=(const USetRangeIterator &other) const { return !operator==(other); } 1536 1537 /** @stable ICU 76 */ 1538 CodePointRange operator*() const { 1539 if (rangeIndex < rangeCount) { 1540 UChar32 start, end; 1541 UErrorCode errorCode = U_ZERO_ERROR; 1542 int32_t result = uset_getItem(uset, rangeIndex, &start, &end, nullptr, 0, &errorCode); 1543 if (U_SUCCESS(errorCode) && result == 0) { 1544 return CodePointRange(start, end); 1545 } 1546 } 1547 return CodePointRange(U_SENTINEL, U_SENTINEL); 1548 } 1549 1550 /** 1551 * Pre-increment. 1552 * @stable ICU 76 1553 */ 1554 USetRangeIterator &operator++() { 1555 ++rangeIndex; 1556 return *this; 1557 } 1558 1559 /** 1560 * Post-increment. 1561 * @stable ICU 76 1562 */ 1563 USetRangeIterator operator++(int) { 1564 USetRangeIterator result(*this); 1565 ++rangeIndex; 1566 return result; 1567 } 1568 1569 private: 1570 friend class USetRanges; 1571 1572 USetRangeIterator(const USet *pUset, int32_t nRangeIndex, int32_t nRangeCount) 1573 : uset(pUset), rangeIndex(nRangeIndex), rangeCount(nRangeCount) {} 1574 1575 const USet *uset; 1576 int32_t rangeIndex; 1577 int32_t rangeCount; 1578 }; 1579 1580 /** 1581 * C++ "range" for iterating over the code point ranges of a USet. 
1582 * 1583 * \code 1584 * using U_HEADER_NESTED_NAMESPACE::USetRanges; 1585 * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode)); 1586 * for (auto [start, end] : USetRanges(uset.getAlias())) { 1587 * printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end); 1588 * } 1589 * for (auto range : USetRanges(uset.getAlias())) { 1590 * for (UChar32 c : range) { 1591 * printf("uset.range.c U+%04lx\n", (long)c); 1592 * } 1593 * } 1594 * \endcode 1595 * 1596 * C++ UnicodeSet has member functions for iteration, including ranges(). 1597 * 1598 * @stable ICU 76 1599 * @see USetCodePoints 1600 * @see USetStrings 1601 * @see USetElements 1602 */ 1603 class USetRanges { 1604 public: 1605 /** 1606 * Constructs a C++ "range" object over the code point ranges of the USet. 1607 * @stable ICU 76 1608 */ 1609 USetRanges(const USet *pUset) : uset(pUset), rangeCount(uset_getRangeCount(pUset)) {} 1610 1611 /** @stable ICU 76 */ 1612 USetRanges(const USetRanges &other) = default; 1613 1614 /** @stable ICU 76 */ 1615 USetRangeIterator begin() const { 1616 return USetRangeIterator(uset, 0, rangeCount); 1617 } 1618 1619 /** @stable ICU 76 */ 1620 USetRangeIterator end() const { 1621 return USetRangeIterator(uset, rangeCount, rangeCount); 1622 } 1623 1624 private: 1625 const USet *uset; 1626 int32_t rangeCount; 1627 }; 1628 1629 /** 1630 * Iterator returned by USetStrings. 1631 * @stable ICU 76 1632 */ 1633 class USetStringIterator { 1634 public: 1635 /** @stable ICU 76 */ 1636 USetStringIterator(const USetStringIterator &other) = default; 1637 1638 /** @stable ICU 76 */ 1639 bool operator==(const USetStringIterator &other) const { 1640 // No need to compare count given private constructor 1641 // and assuming we don't compare iterators across the set being modified. 1642 // We might even skip comparing uset. 1643 // Unless we want operator==() to be "correct" for more than iteration. 
1644 return uset == other.uset && index == other.index; 1645 } 1646 1647 /** @stable ICU 76 */ 1648 bool operator!=(const USetStringIterator &other) const { return !operator==(other); } 1649 1650 /** @stable ICU 76 */ 1651 std::u16string_view operator*() const { 1652 if (index < count) { 1653 int32_t length; 1654 const UChar *uchars = uset_getString(uset, index, &length); 1655 // assert uchars != nullptr; 1656 return {uprv_char16PtrFromUChar(uchars), static_cast<size_t>(length)}; 1657 } 1658 return {}; 1659 } 1660 1661 /** 1662 * Pre-increment. 1663 * @stable ICU 76 1664 */ 1665 USetStringIterator &operator++() { 1666 ++index; 1667 return *this; 1668 } 1669 1670 /** 1671 * Post-increment. 1672 * @stable ICU 76 1673 */ 1674 USetStringIterator operator++(int) { 1675 USetStringIterator result(*this); 1676 ++index; 1677 return result; 1678 } 1679 1680 private: 1681 friend class USetStrings; 1682 1683 USetStringIterator(const USet *pUset, int32_t nIndex, int32_t nCount) 1684 : uset(pUset), index(nIndex), count(nCount) {} 1685 1686 const USet *uset; 1687 int32_t index; 1688 int32_t count; 1689 }; 1690 1691 /** 1692 * C++ "range" for iterating over the empty and multi-character strings of a USet. 1693 * 1694 * \code 1695 * using U_HEADER_NESTED_NAMESPACE::USetStrings; 1696 * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode)); 1697 * for (auto s : USetStrings(uset.getAlias())) { 1698 * int32_t len32 = s.length(); 1699 * char utf8[200]; 1700 * u_strToUTF8WithSub(utf8, int32_t{sizeof(utf8) - 1}, nullptr, 1701 * s.data(), len32, 0xFFFD, nullptr, errorCode); 1702 * printf("uset.string length %ld \"%s\"\n", long{len32}, utf8); 1703 * } 1704 * \endcode 1705 * 1706 * C++ UnicodeSet has member functions for iteration, including strings(). 
1707 * 1708 * @stable ICU 76 1709 * @see USetCodePoints 1710 * @see USetRanges 1711 * @see USetElements 1712 */ 1713 class USetStrings { 1714 public: 1715 /** 1716 * Constructs a C++ "range" object over the strings of the USet. 1717 * @stable ICU 76 1718 */ 1719 USetStrings(const USet *pUset) : uset(pUset), count(uset_getStringCount(pUset)) {} 1720 1721 /** @stable ICU 76 */ 1722 USetStrings(const USetStrings &other) = default; 1723 1724 /** @stable ICU 76 */ 1725 USetStringIterator begin() const { 1726 return USetStringIterator(uset, 0, count); 1727 } 1728 1729 /** @stable ICU 76 */ 1730 USetStringIterator end() const { 1731 return USetStringIterator(uset, count, count); 1732 } 1733 1734 private: 1735 const USet *uset; 1736 int32_t count; 1737 }; 1738 1739 #ifndef U_HIDE_DRAFT_API 1740 /** 1741 * Iterator returned by USetElements. 1742 * @draft ICU 77 1743 */ 1744 class USetElementIterator { 1745 public: 1746 /** @draft ICU 77 */ 1747 USetElementIterator(const USetElementIterator &other) = default; 1748 1749 /** @draft ICU 77 */ 1750 bool operator==(const USetElementIterator &other) const { 1751 // No need to compare rangeCount & end given private constructor 1752 // and assuming we don't compare iterators across the set being modified. 1753 // We might even skip comparing uset. 1754 // Unless we want operator==() to be "correct" for more than iteration. 1755 return uset == other.uset && c == other.c && index == other.index; 1756 } 1757 1758 /** @draft ICU 77 */ 1759 bool operator!=(const USetElementIterator &other) const { return !operator==(other); } 1760 1761 /** @draft ICU 77 */ 1762 std::u16string operator*() const { 1763 if (c >= 0) { 1764 return c <= 0xffff ? 
1765 std::u16string({static_cast<char16_t>(c)}) : 1766 std::u16string({U16_LEAD(c), U16_TRAIL(c)}); 1767 } else if (index < totalCount) { 1768 int32_t length; 1769 const UChar *uchars = uset_getString(uset, index - rangeCount, &length); 1770 // assert uchars != nullptr; 1771 return {uprv_char16PtrFromUChar(uchars), static_cast<size_t>(length)}; 1772 } else { 1773 return {}; 1774 } 1775 } 1776 1777 /** 1778 * Pre-increment. 1779 * @draft ICU 77 1780 */ 1781 USetElementIterator &operator++() { 1782 if (c < end) { 1783 ++c; 1784 } else if (index < rangeCount) { 1785 UErrorCode errorCode = U_ZERO_ERROR; 1786 int32_t result = uset_getItem(uset, index, &c, &end, nullptr, 0, &errorCode); 1787 if (U_SUCCESS(errorCode) && result == 0) { 1788 ++index; 1789 } else { 1790 c = end = U_SENTINEL; 1791 } 1792 } else if (c >= 0) { 1793 // assert index == rangeCount; 1794 // Switch from the last range to the first string. 1795 c = end = U_SENTINEL; 1796 } else { 1797 ++index; 1798 } 1799 return *this; 1800 } 1801 1802 /** 1803 * Post-increment. 1804 * @draft ICU 77 1805 */ 1806 USetElementIterator operator++(int) { 1807 USetElementIterator result(*this); 1808 operator++(); 1809 return result; 1810 } 1811 1812 private: 1813 friend class USetElements; 1814 1815 USetElementIterator(const USet *pUset, int32_t nIndex, int32_t nRangeCount, int32_t nTotalCount) 1816 : uset(pUset), index(nIndex), rangeCount(nRangeCount), totalCount(nTotalCount), 1817 c(U_SENTINEL), end(U_SENTINEL) { 1818 if (index < rangeCount) { 1819 // Fetch the first range. 1820 operator++(); 1821 } 1822 // Otherwise don't move beyond the (index - rangeCount)-th string. 1823 } 1824 1825 const USet *uset; 1826 int32_t index; 1827 /** Number of UnicodeSet/USet code point ranges. */ 1828 int32_t rangeCount; 1829 /** 1830 * Number of code point ranges plus number of strings. 1831 * index starts from 0, counts ranges while less than rangeCount, 1832 * then counts strings while at least rangeCount and less than totalCount. 
1833 * 1834 * Note that totalCount is the same as uset_getItemCount(), but usually 1835 * smaller than the number of elements returned by this iterator 1836 * because we return each code point of each range. 1837 */ 1838 int32_t totalCount; 1839 UChar32 c, end; 1840 }; 1841 1842 /** 1843 * A C++ "range" for iterating over all of the elements of a USet. 1844 * Convenient all-in one iteration, but creates a std::u16string for each 1845 * code point or string. 1846 * 1847 * Code points are returned first, then empty and multi-character strings. 1848 * 1849 * \code 1850 * using U_HEADER_NESTED_NAMESPACE::USetElements; 1851 * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode)); 1852 * for (auto el : USetElements(uset.getAlias())) { 1853 * int32_t len32 = el.length(); 1854 * char utf8[200]; 1855 * u_strToUTF8WithSub(utf8, int32_t{sizeof(utf8) - 1}, nullptr, 1856 * el.data(), len32, 0xFFFD, nullptr, errorCode); 1857 * printf("uset.element length %ld \"%s\"\n", long{len32}, utf8); 1858 * } 1859 * \endcode 1860 * 1861 * C++ UnicodeSet has member functions for iteration, including begin() and end(). 1862 * 1863 * @return an all-elements iterator. 1864 * @draft ICU 77 1865 * @see USetCodePoints 1866 * @see USetRanges 1867 * @see USetStrings 1868 */ 1869 class USetElements { 1870 public: 1871 /** 1872 * Constructs a C++ "range" object over all of the elements of the USet. 
1873 * @draft ICU 77 1874 */ 1875 USetElements(const USet *pUset) 1876 : uset(pUset), rangeCount(uset_getRangeCount(pUset)), 1877 stringCount(uset_getStringCount(pUset)) {} 1878 1879 /** @draft ICU 77 */ 1880 USetElements(const USetElements &other) = default; 1881 1882 /** @draft ICU 77 */ 1883 USetElementIterator begin() const { 1884 return USetElementIterator(uset, 0, rangeCount, rangeCount + stringCount); 1885 } 1886 1887 /** @draft ICU 77 */ 1888 USetElementIterator end() const { 1889 return USetElementIterator(uset, rangeCount + stringCount, rangeCount, rangeCount + stringCount); 1890 } 1891 1892 private: 1893 const USet *uset; 1894 int32_t rangeCount, stringCount; 1895 }; 1896 1897 #endif // U_HIDE_DRAFT_API 1898 1899 } // namespace U_HEADER_ONLY_NAMESPACE 1900 1901 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API 1902 1903 #endif // __USET_H__