tblcoll.h (39717B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ****************************************************************************** 8 */ 9 10 /** 11 * \file 12 * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class. 13 */ 14 15 /** 16 * File tblcoll.h 17 * 18 * Created by: Helena Shih 19 * 20 * Modification History: 21 * 22 * Date Name Description 23 * 2/5/97 aliu Added streamIn and streamOut methods. Added 24 * constructor which reads RuleBasedCollator object from 25 * a binary file. Added writeToFile method which streams 26 * RuleBasedCollator out to a binary file. The streamIn 27 * and streamOut methods use istream and ostream objects 28 * in binary mode. 29 * 2/12/97 aliu Modified to use TableCollationData sub-object to 30 * hold invariant data. 31 * 2/13/97 aliu Moved several methods into this class from Collation. 32 * Added a private RuleBasedCollator(Locale&) constructor, 33 * to be used by Collator::createDefault(). General 34 * clean up. 35 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 36 * constructor and getDynamicClassID. 37 * 3/5/97 aliu Modified constructFromFile() to add parameter 38 * specifying whether or not binary loading is to be 39 * attempted. This is required for dynamic rule loading. 40 * 05/07/97 helena Added memory allocation error detection. 41 * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to 42 * use MergeCollation::getPattern. 43 * 6/20/97 helena Java class name change. 44 * 8/18/97 helena Added internal API documentation. 45 * 09/03/97 helena Added createCollationKeyValues(). 
46 * 02/10/98 damiba Added compare with "length" parameter 47 * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java 48 * 04/23/99 stephen Removed EDecompositionMode, merged with 49 * Normalizer::EMode 50 * 06/14/99 stephen Removed kResourceBundleSuffix 51 * 11/02/99 helena Collator performance enhancements. Eliminates the 52 * UnicodeString construction and special case for NO_OP. 53 * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator 54 * internal state management. 55 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 56 * to implementation file. 57 * 01/29/01 synwee Modified into a C++ wrapper which calls C API 58 * (ucol.h) 59 * 2012-2014 markus Rewritten in C++ again. 60 */ 61 62 #ifndef TBLCOLL_H 63 #define TBLCOLL_H 64 65 #include "unicode/utypes.h" 66 67 #if U_SHOW_CPLUSPLUS_API 68 69 #if !UCONFIG_NO_COLLATION 70 71 #include "unicode/coll.h" 72 #include "unicode/locid.h" 73 #include "unicode/uiter.h" 74 #include "unicode/ucol.h" 75 76 U_NAMESPACE_BEGIN 77 78 struct CollationCacheEntry; 79 struct CollationData; 80 struct CollationSettings; 81 struct CollationTailoring; 82 /** 83 * @stable ICU 2.0 84 */ 85 class StringSearch; 86 /** 87 * @stable ICU 2.0 88 */ 89 class CollationElementIterator; 90 class CollationKey; 91 class SortKeyByteSink; 92 class UnicodeSet; 93 class UnicodeString; 94 class UVector64; 95 96 /** 97 * The RuleBasedCollator class provides the implementation of 98 * Collator, using data-driven tables. The user can create a customized 99 * table-based collation. 100 * <p> 101 * For more information about the collation service see 102 * <a href="https://unicode-org.github.io/icu/userguide/collation">the User Guide</a>. 103 * <p> 104 * Collation service provides correct sorting orders for most locales supported in ICU. 
105 * If specific data for a locale is not available, the order eventually falls back 106 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>. 107 * <p> 108 * Sort ordering may be customized by providing your own set of rules. For more on 109 * this subject see the <a href="https://unicode-org.github.io/icu/userguide/collation/customization"> 110 * Collation Customization</a> section of the User Guide. 111 * <p> 112 * Note, RuleBasedCollator is not to be subclassed. 113 * @see Collator 114 */ 115 class U_I18N_API_CLASS RuleBasedCollator final : public Collator { 116 public: 117 /** 118 * RuleBasedCollator constructor. This takes the table rules and builds a 119 * collation table out of them. Please see RuleBasedCollator class 120 * description for more details on the collation rule syntax. 121 * @param rules the collation rules to build the collation table from. 122 * @param status reporting a success or an error. 123 * @stable ICU 2.0 124 */ 125 U_I18N_API RuleBasedCollator(const UnicodeString& rules, UErrorCode& status); 126 127 /** 128 * RuleBasedCollator constructor. This takes the table rules and builds a 129 * collation table out of them. Please see RuleBasedCollator class 130 * description for more details on the collation rule syntax. 131 * @param rules the collation rules to build the collation table from. 132 * @param collationStrength strength for comparison 133 * @param status reporting a success or an error. 134 * @stable ICU 2.0 135 */ 136 U_I18N_API RuleBasedCollator(const UnicodeString& rules, 137 ECollationStrength collationStrength, 138 UErrorCode& status); 139 140 /** 141 * RuleBasedCollator constructor. This takes the table rules and builds a 142 * collation table out of them. Please see RuleBasedCollator class 143 * description for more details on the collation rule syntax. 144 * @param rules the collation rules to build the collation table from. 
145 * @param decompositionMode the normalisation mode 146 * @param status reporting a success or an error. 147 * @stable ICU 2.0 148 */ 149 U_I18N_API RuleBasedCollator(const UnicodeString& rules, 150 UColAttributeValue decompositionMode, 151 UErrorCode& status); 152 153 /** 154 * RuleBasedCollator constructor. This takes the table rules and builds a 155 * collation table out of them. Please see RuleBasedCollator class 156 * description for more details on the collation rule syntax. 157 * @param rules the collation rules to build the collation table from. 158 * @param collationStrength strength for comparison 159 * @param decompositionMode the normalisation mode 160 * @param status reporting a success or an error. 161 * @stable ICU 2.0 162 */ 163 U_I18N_API RuleBasedCollator(const UnicodeString& rules, 164 ECollationStrength collationStrength, 165 UColAttributeValue decompositionMode, 166 UErrorCode& status); 167 168 #ifndef U_HIDE_INTERNAL_API 169 /** 170 * TODO: document & propose as public API 171 * @internal 172 */ 173 U_I18N_API RuleBasedCollator(const UnicodeString& rules, 174 UParseError& parseError, 175 UnicodeString& reason, 176 UErrorCode& errorCode); 177 #endif /* U_HIDE_INTERNAL_API */ 178 179 /** 180 * Copy constructor. 181 * @param other the RuleBasedCollator object to be copied 182 * @stable ICU 2.0 183 */ 184 U_I18N_API RuleBasedCollator(const RuleBasedCollator& other); 185 186 /** Opens a collator from a collator binary image created using 187 * cloneBinary. Binary image used in instantiation of the 188 * collator remains owned by the user and should stay around for 189 * the lifetime of the collator. The API also takes a base collator 190 * which must be the root collator. 191 * @param bin binary image owned by the user and required through the 192 * lifetime of the collator 193 * @param length size of the image. 
If negative, the API will try to 194 * figure out the length of the image 195 * @param base Base collator, for lookup of untailored characters. 196 * Must be the root collator, must not be nullptr. 197 * The base is required to be present through the lifetime of the collator. 198 * @param status for catching errors 199 * @return newly created collator 200 * @see cloneBinary 201 * @stable ICU 3.4 202 */ 203 U_I18N_API RuleBasedCollator(const uint8_t* bin, 204 int32_t length, 205 const RuleBasedCollator* base, 206 UErrorCode& status); 207 208 /** 209 * Destructor. 210 * @stable ICU 2.0 211 */ 212 U_I18N_API virtual ~RuleBasedCollator(); 213 214 /** 215 * Assignment operator. 216 * @param other other RuleBasedCollator object to copy from. 217 * @stable ICU 2.0 218 */ 219 U_I18N_API RuleBasedCollator& operator=(const RuleBasedCollator& other); 220 221 /** 222 * Returns true if the argument is the same as this object. 223 * @param other Collator object to be compared. 224 * @return true if the argument is the same as this object. 225 * @stable ICU 2.0 226 */ 227 U_I18N_API virtual bool operator==(const Collator& other) const override; 228 229 /** 230 * Makes a copy of this object. 231 * @return a copy of this object, owned by the caller 232 * @stable ICU 2.0 233 */ 234 U_I18N_API virtual RuleBasedCollator* clone() const override; 235 236 /** 237 * Creates a collation element iterator for the source string. The caller of 238 * this method is responsible for the memory management of the returned 239 * pointer. 240 * @param source the string over which the CollationElementIterator will 241 * iterate. 242 * @return the collation element iterator of the source string using this as 243 * the base Collator. 244 * @stable ICU 2.2 245 */ 246 U_I18N_API CollationElementIterator* 247 createCollationElementIterator(const UnicodeString& source) const; 248 249 /** 250 * Creates a collation element iterator for the source. 
The caller of this 251 * method is responsible for the memory management of the returned pointer. 252 * @param source the CharacterIterator which produces the characters over 253 * which the CollationElementIterator will iterate. 254 * @return the collation element iterator of the source using this as the 255 * base Collator. 256 * @stable ICU 2.2 257 */ 258 U_I18N_API CollationElementIterator* 259 createCollationElementIterator(const CharacterIterator& source) const; 260 261 // Make deprecated versions of Collator::compare() visible. 262 using Collator::compare; 263 264 /** 265 * The comparison function compares the character data stored in two 266 * different strings. Returns information about whether a string is less 267 * than, greater than or equal to another string. 268 * @param source the source string to be compared with. 269 * @param target the string that is to be compared with the source string. 270 * @param status possible error code 271 * @return Returns an enum value. UCOL_GREATER if source is greater 272 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 273 * than target 274 * @stable ICU 2.6 275 **/ 276 U_I18N_API virtual UCollationResult compare(const UnicodeString& source, 277 const UnicodeString& target, 278 UErrorCode& status) const override; 279 280 /** 281 * Does the same thing as compare but limits the comparison to a specified 282 * length 283 * @param source the source string to be compared with. 284 * @param target the string that is to be compared with the source string. 285 * @param length the length the comparison is limited to 286 * @param status possible error code 287 * @return Returns an enum value. UCOL_GREATER if source (up to the specified 288 * length) is greater than target; UCOL_EQUAL if source (up to the specified 289 * length) is equal to target; UCOL_LESS if source (up to the specified 290 * length) is less than target. 
291 * @stable ICU 2.6 292 */ 293 U_I18N_API virtual UCollationResult compare(const UnicodeString& source, 294 const UnicodeString& target, 295 int32_t length, 296 UErrorCode& status) const override; 297 298 /** 299 * The comparison function compares the character data stored in two 300 * different string arrays. Returns information about whether a string array 301 * is less than, greater than or equal to another string array. 302 * @param source the source string array to be compared with. 303 * @param sourceLength the length of the source string array. If this value 304 * is equal to -1, the string array is null-terminated. 305 * @param target the string that is to be compared with the source string. 306 * @param targetLength the length of the target string array. If this value 307 * is equal to -1, the string array is null-terminated. 308 * @param status possible error code 309 * @return Returns an enum value. UCOL_GREATER if source is greater 310 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 311 * than target 312 * @stable ICU 2.6 313 */ 314 U_I18N_API virtual UCollationResult compare(const char16_t* source, int32_t sourceLength, 315 const char16_t* target, int32_t targetLength, 316 UErrorCode& status) const override; 317 318 /** 319 * Compares two strings using the Collator. 320 * Returns whether the first one compares less than/equal to/greater than 321 * the second one. 322 * This version takes UCharIterator input. 323 * @param sIter the first ("source") string iterator 324 * @param tIter the second ("target") string iterator 325 * @param status ICU status 326 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 327 * @stable ICU 4.2 328 */ 329 U_I18N_API virtual UCollationResult compare(UCharIterator& sIter, 330 UCharIterator& tIter, 331 UErrorCode& status) const override; 332 333 /** 334 * Compares two UTF-8 strings using the Collator. 
335 * Returns whether the first one compares less than/equal to/greater than 336 * the second one. 337 * This version takes UTF-8 input. 338 * Note that a StringPiece can be implicitly constructed 339 * from a std::string or a NUL-terminated const char * string. 340 * @param source the first UTF-8 string 341 * @param target the second UTF-8 string 342 * @param status ICU status 343 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 344 * @stable ICU 51 345 */ 346 U_I18N_API virtual UCollationResult compareUTF8(const StringPiece& source, 347 const StringPiece& target, 348 UErrorCode& status) const override; 349 350 /** 351 * Transforms the string into a series of characters 352 * that can be compared with CollationKey.compare(). 353 * 354 * Note that sort keys are often less efficient than simply doing comparison. 355 * For more details, see the ICU User Guide. 356 * 357 * @param source the source string. 358 * @param key the transformed key of the source string. 359 * @param status the error code status. 360 * @return the transformed key. 361 * @see CollationKey 362 * @stable ICU 2.0 363 */ 364 U_I18N_API virtual CollationKey& getCollationKey(const UnicodeString& source, 365 CollationKey& key, 366 UErrorCode& status) const override; 367 368 /** 369 * Transforms a specified region of the string into a series of characters 370 * that can be compared with CollationKey.compare. 371 * 372 * Note that sort keys are often less efficient than simply doing comparison. 373 * For more details, see the ICU User Guide. 374 * 375 * @param source the source string. 376 * @param sourceLength the length of the source string. 377 * @param key the transformed key of the source string. 378 * @param status the error code status. 379 * @return the transformed key. 
380 * @see CollationKey 381 * @stable ICU 2.0 382 */ 383 U_I18N_API virtual CollationKey& getCollationKey(const char16_t* source, 384 int32_t sourceLength, 385 CollationKey& key, 386 UErrorCode& status) const override; 387 388 /** 389 * Generates the hash code for the rule-based collation object. 390 * @return the hash code. 391 * @stable ICU 2.0 392 */ 393 U_I18N_API virtual int32_t hashCode() const override; 394 395 #ifndef U_FORCE_HIDE_DEPRECATED_API 396 /** 397 * Gets the locale of the Collator 398 * @param type can be either requested, valid or actual locale. For more 399 * information see the definition of ULocDataLocaleType in 400 * uloc.h 401 * @param status the error code status. 402 * @return locale where the collation data lives. If the collator 403 * was instantiated from rules, locale is empty. 404 * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback 405 */ 406 U_I18N_API virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const override; 407 #endif // U_FORCE_HIDE_DEPRECATED_API 408 409 /** 410 * Gets the tailoring rules for this collator. 411 * @return the collation tailoring from which this collator was created 412 * @stable ICU 2.0 413 */ 414 U_I18N_API const UnicodeString& getRules() const; 415 416 /** 417 * Gets the version information for a Collator. 418 * @param info the version # information, the result will be filled in 419 * @stable ICU 2.0 420 */ 421 U_I18N_API virtual void getVersion(UVersionInfo info) const override; 422 423 #ifndef U_HIDE_DEPRECATED_API 424 /** 425 * Returns the maximum length of any expansion sequences that end with the 426 * specified comparison order. 427 * 428 * This is specific to the kind of collation element values and sequences 429 * returned by the CollationElementIterator. 430 * Call CollationElementIterator::getMaxExpansion() instead. 431 * 432 * @param order a collation order returned by CollationElementIterator::previous 433 * or CollationElementIterator::next. 
434 * @return maximum size of the expansion sequences ending with the collation 435 * element, or 1 if the collation element does not occur at the end of 436 * any expansion sequence 437 * @see CollationElementIterator#getMaxExpansion 438 * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead. 439 */ 440 U_I18N_API int32_t getMaxExpansion(int32_t order) const; 441 #endif /* U_HIDE_DEPRECATED_API */ 442 443 /** 444 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This 445 * method is to implement a simple version of RTTI, since not all C++ 446 * compilers support genuine RTTI. Polymorphic operator==() and clone() 447 * methods call this method. 448 * @return The class ID for this object. All objects of a given class have 449 * the same class ID. Objects of other classes have different class 450 * IDs. 451 * @stable ICU 2.0 452 */ 453 U_I18N_API virtual UClassID getDynamicClassID() const override; 454 455 /** 456 * Returns the class ID for this class. This is useful only for comparing to 457 * a return value from getDynamicClassID(). For example: 458 * <pre> 459 * Base* polymorphic_pointer = createPolymorphicObject(); 460 * if (polymorphic_pointer->getDynamicClassID() == 461 * Derived::getStaticClassID()) ... 462 * </pre> 463 * @return The class ID for all objects of this class. 464 * @stable ICU 2.0 465 */ 466 U_I18N_API static UClassID getStaticClassID(); 467 468 #ifndef U_HIDE_DEPRECATED_API 469 /** 470 * Do not use this method: The caller and the ICU library might use different heaps. 471 * Use cloneBinary() instead which writes to caller-provided memory. 472 * 473 * Returns a binary format of this collator. 474 * @param length Returns the length of the data, in bytes 475 * @param status the error code status. 476 * @return memory, owned by the caller, of size 'length' bytes. 477 * @deprecated ICU 52. Use cloneBinary() instead. 
478 */ 479 U_I18N_API uint8_t* cloneRuleData(int32_t& length, UErrorCode& status) const; 480 #endif /* U_HIDE_DEPRECATED_API */ 481 482 /** Creates a binary image of a collator. This binary image can be stored and 483 * later used to instantiate a collator using ucol_openBinary. 484 * This API supports preflighting. 485 * @param buffer a fill-in buffer to receive the binary image 486 * @param capacity capacity of the destination buffer 487 * @param status for catching errors 488 * @return size of the image 489 * @see ucol_openBinary 490 * @stable ICU 3.4 491 */ 492 U_I18N_API int32_t cloneBinary(uint8_t* buffer, int32_t capacity, UErrorCode& status) const; 493 494 /** 495 * Returns current rules. Delta defines whether full rules are returned or 496 * just the tailoring. 497 * 498 * getRules(void) should normally be used instead. 499 * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales 500 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. 501 * @param buffer UnicodeString to store the result rules 502 * @stable ICU 2.2 503 * @see UCOL_FULL_RULES 504 */ 505 U_I18N_API void getRules(UColRuleOption delta, UnicodeString& buffer) const; 506 507 /** 508 * Universal attribute setter 509 * @param attr attribute type 510 * @param value attribute value 511 * @param status to indicate whether the operation went on smoothly or there were errors 512 * @stable ICU 2.2 513 */ 514 U_I18N_API virtual void setAttribute(UColAttribute attr, 515 UColAttributeValue value, 516 UErrorCode& status) override; 517 518 /** 519 * Universal attribute getter. 520 * @param attr attribute type 521 * @param status to indicate whether the operation went on smoothly or there were errors 522 * @return attribute value 523 * @stable ICU 2.2 524 */ 525 U_I18N_API virtual UColAttributeValue getAttribute(UColAttribute attr, 526 UErrorCode& status) const override; 527 528 /** 529 * Sets the variable top to the top of the specified reordering group. 
530 * The variable top determines the highest-sorting character 531 * which is affected by UCOL_ALTERNATE_HANDLING. 532 * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. 533 * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, 534 * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; 535 * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group 536 * @param errorCode Standard ICU error code. Its input value must 537 * pass the U_SUCCESS() test, or else the function returns 538 * immediately. Check for U_FAILURE() on output or use with 539 * function chaining. (See User Guide for details.) 540 * @return *this 541 * @see getMaxVariable 542 * @stable ICU 53 543 */ 544 U_I18N_API virtual Collator& setMaxVariable(UColReorderCode group, UErrorCode& errorCode) override; 545 546 /** 547 * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. 548 * @return the maximum variable reordering group. 549 * @see setMaxVariable 550 * @stable ICU 53 551 */ 552 U_I18N_API virtual UColReorderCode getMaxVariable() const override; 553 554 #ifndef U_FORCE_HIDE_DEPRECATED_API 555 /** 556 * Sets the variable top to the primary weight of the specified string. 557 * 558 * Beginning with ICU 53, the variable top is pinned to 559 * the top of one of the supported reordering groups, 560 * and it must not be beyond the last of those groups. 561 * See setMaxVariable(). 562 * @param varTop one or more (if contraction) char16_ts to which the variable top should be set 563 * @param len length of variable top string. If -1 it is considered to be zero terminated. 564 * @param status error code. If error code is set, the return value is undefined. 
Errors set by this function are: <br> 565 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 566 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 567 * the last reordering group supported by setMaxVariable() 568 * @return variable top primary weight 569 * @deprecated ICU 53 Call setMaxVariable() instead. 570 */ 571 U_I18N_API virtual uint32_t setVariableTop(const char16_t* varTop, 572 int32_t len, 573 UErrorCode& status) override; 574 575 /** 576 * Sets the variable top to the primary weight of the specified string. 577 * 578 * Beginning with ICU 53, the variable top is pinned to 579 * the top of one of the supported reordering groups, 580 * and it must not be beyond the last of those groups. 581 * See setMaxVariable(). 582 * @param varTop a UnicodeString size 1 or more (if contraction) of char16_ts to which the variable top should be set 583 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 584 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 585 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 586 * the last reordering group supported by setMaxVariable() 587 * @return variable top primary weight 588 * @deprecated ICU 53 Call setMaxVariable() instead. 589 */ 590 U_I18N_API virtual uint32_t setVariableTop(const UnicodeString& varTop, UErrorCode& status) override; 591 592 /** 593 * Sets the variable top to the specified primary weight. 594 * 595 * Beginning with ICU 53, the variable top is pinned to 596 * the top of one of the supported reordering groups, 597 * and it must not be beyond the last of those groups. 598 * See setMaxVariable(). 599 * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop 600 * @param status error code 601 * @deprecated ICU 53 Call setMaxVariable() instead. 
602 */ 603 U_I18N_API virtual void setVariableTop(uint32_t varTop, UErrorCode& status) override; 604 #endif // U_FORCE_HIDE_DEPRECATED_API 605 606 /** 607 * Gets the variable top value of a Collator. 608 * @param status error code (not changed by function). If error code is set, the return value is undefined. 609 * @return the variable top primary weight 610 * @see getMaxVariable 611 * @stable ICU 2.0 612 */ 613 U_I18N_API virtual uint32_t getVariableTop(UErrorCode& status) const override; 614 615 /** 616 * Get a UnicodeSet that contains all the characters and sequences tailored in 617 * this collator. 618 * @param status error code of the operation 619 * @return a pointer to a UnicodeSet object containing all the 620 * code points and sequences that may sort differently than 621 * in the root collator. The object must be disposed of by using delete 622 * @stable ICU 2.4 623 */ 624 U_I18N_API virtual UnicodeSet* getTailoredSet(UErrorCode& status) const override; 625 626 /** 627 * Get the sort key as an array of bytes from a UnicodeString. 628 * 629 * Note that sort keys are often less efficient than simply doing comparison. 630 * For more details, see the ICU User Guide. 631 * 632 * @param source string to be processed. 633 * @param result buffer to store result in. If nullptr, number of bytes needed 634 * will be returned. 635 * @param resultLength length of the result buffer. If it is not large enough, the 636 * buffer will be filled to capacity. 637 * @return Number of bytes needed for storing the sort key 638 * @stable ICU 2.0 639 */ 640 U_I18N_API virtual int32_t getSortKey(const UnicodeString& source, 641 uint8_t* result, 642 int32_t resultLength) const override; 643 644 /** 645 * Get the sort key as an array of bytes from a char16_t buffer. 646 * 647 * Note that sort keys are often less efficient than simply doing comparison. 648 * For more details, see the ICU User Guide. 649 * 650 * @param source string to be processed. 
651 * @param sourceLength length of string to be processed. If -1, the string 652 * is 0 terminated and length will be decided by the function. 653 * @param result buffer to store result in. If nullptr, number of bytes needed 654 * will be returned. 655 * @param resultLength length of the result buffer. If it is not large enough, the 656 * buffer will be filled to capacity. 657 * @return Number of bytes needed for storing the sort key 658 * @stable ICU 2.2 659 */ 660 U_I18N_API virtual int32_t getSortKey(const char16_t* source, 661 int32_t sourceLength, 662 uint8_t* result, 663 int32_t resultLength) const override; 664 665 /** 666 * Retrieves the reordering codes for this collator. 667 * @param dest The array to fill with the script ordering. 668 * @param destCapacity The length of dest. If it is 0, then dest may be nullptr and the function 669 * will only return the length of the result without writing any codes (pre-flighting). 670 * @param status A reference to an error code value, which must not indicate 671 * a failure before the function call. 672 * @return The length of the script ordering array. 673 * @see ucol_setReorderCodes 674 * @see Collator#getEquivalentReorderCodes 675 * @see Collator#setReorderCodes 676 * @stable ICU 4.8 677 */ 678 U_I18N_API virtual int32_t getReorderCodes(int32_t* dest, 679 int32_t destCapacity, 680 UErrorCode& status) const override; 681 682 /** 683 * Sets the ordering of scripts for this collator. 684 * @param reorderCodes An array of script codes in the new order. This can be nullptr if the 685 * length is also set to 0. An empty array will clear any reordering codes on the collator. 686 * @param reorderCodesLength The length of reorderCodes. 
687 * @param status error code 688 * @see ucol_setReorderCodes 689 * @see Collator#getReorderCodes 690 * @see Collator#getEquivalentReorderCodes 691 * @stable ICU 4.8 692 */ 693 U_I18N_API virtual void setReorderCodes(const int32_t* reorderCodes, 694 int32_t reorderCodesLength, 695 UErrorCode& status) override; 696 697 /** 698 * Implements ucol_strcollUTF8(). 699 * @internal 700 */ 701 U_I18N_API virtual UCollationResult internalCompareUTF8(const char* left, int32_t leftLength, 702 const char* right, int32_t rightLength, 703 UErrorCode& errorCode) const override; 704 705 /** Get the short definition string for a collator. This internal API harvests the collator's 706 * locale and the attribute set and produces a string that can be used for opening 707 * a collator with the same attributes using the ucol_openFromShortString API. 708 * This string will be normalized. 709 * The structure and the syntax of the string is defined in the "Naming collators" 710 * section of the users guide: 711 * https://unicode-org.github.io/icu/userguide/collation/concepts#collator-naming-scheme 712 * This function supports preflighting. 713 * 714 * This is internal, and intended to be used with delegate converters. 715 * 716 * @param locale a locale that will appear as a collators locale in the resulting 717 * short string definition. If nullptr, the locale will be harvested 718 * from the collator. 719 * @param buffer space to hold the resulting string 720 * @param capacity capacity of the buffer 721 * @param status for returning errors. All the preflighting errors are featured 722 * @return length of the resulting string 723 * @see ucol_openFromShortString 724 * @see ucol_normalizeShortDefinitionString 725 * @see ucol_getShortDefinitionString 726 * @internal 727 */ 728 U_I18N_API virtual int32_t internalGetShortDefinitionString(const char* locale, 729 char* buffer, 730 int32_t capacity, 731 UErrorCode& status) const override; 732 733 /** 734 * Implements ucol_nextSortKeyPart(). 
735 * @internal 736 */ 737 U_I18N_API virtual int32_t internalNextSortKeyPart(UCharIterator* iter, 738 uint32_t state[2], 739 uint8_t* dest, 740 int32_t count, 741 UErrorCode& errorCode) const override; 742 743 // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API 744 /** 745 * Only for use in ucol_openRules(). 746 * @internal 747 */ 748 U_I18N_API RuleBasedCollator(); 749 750 #ifndef U_HIDE_INTERNAL_API 751 /** 752 * Implements ucol_getLocaleByType(). 753 * Needed because the lifetime of the locale ID string must match that of the collator. 754 * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper. 755 * @internal 756 */ 757 U_I18N_API const char* internalGetLocaleID(ULocDataLocaleType type, UErrorCode& errorCode) const; 758 759 /** 760 * Implements ucol_getContractionsAndExpansions(). 761 * Gets this collator's sets of contraction strings and/or 762 * characters and strings that map to multiple collation elements (expansions). 763 * If addPrefixes is true, then contractions that are expressed as 764 * prefix/pre-context rules are included. 765 * @param contractions if not nullptr, the set to hold the contractions 766 * @param expansions if not nullptr, the set to hold the expansions 767 * @param addPrefixes include prefix contextual mappings 768 * @param errorCode in/out ICU error code 769 * @internal 770 */ 771 U_I18N_API void internalGetContractionsAndExpansions(UnicodeSet* contractions, 772 UnicodeSet* expansions, 773 UBool addPrefixes, 774 UErrorCode& errorCode) const; 775 776 /** 777 * Adds the contractions that start with character c to the set. 778 * Ignores prefixes. Used by AlphabeticIndex. 779 * @internal 780 */ 781 U_I18N_API void internalAddContractions(UChar32 c, UnicodeSet& set, UErrorCode& errorCode) const; 782 783 /** 784 * Implements from-rule constructors, and ucol_openRules(). 
785 * @internal 786 */ 787 U_I18N_API void internalBuildTailoring(const UnicodeString& rules, 788 int32_t strength, 789 UColAttributeValue decompositionMode, 790 UParseError* outParseError, 791 UnicodeString* outReason, 792 UErrorCode& errorCode); 793 794 /** @internal */ 795 static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) { 796 return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc)); 797 } 798 /** @internal */ 799 static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) { 800 return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc)); 801 } 802 803 /** 804 * Appends the CEs for the string to the vector. 805 * @internal for tests & tools 806 */ 807 U_I18N_API void internalGetCEs(const UnicodeString& str, 808 UVector64& ces, 809 UErrorCode& errorCode) const; 810 #endif // U_HIDE_INTERNAL_API 811 812 protected: 813 /** 814 * Used internally by registration to define the requested and valid locales. 815 * @param requestedLocale the requested locale 816 * @param validLocale the valid locale 817 * @param actualLocale the actual locale 818 * @internal 819 */ 820 virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) override; 821 822 private: 823 friend class CollationElementIterator; 824 friend class Collator; 825 826 RuleBasedCollator(const CollationCacheEntry *entry); 827 828 /** 829 * Enumeration of attributes that are relevant for short definition strings 830 * (e.g., ucol_getShortDefinitionString()). 831 * Effectively extends UColAttribute. 832 */ 833 enum Attributes { 834 ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT, 835 ATTR_LIMIT 836 }; 837 838 void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode); 839 840 // Both lengths must be <0 or else both must be >=0. 
841 UCollationResult doCompare(const char16_t *left, int32_t leftLength, 842 const char16_t *right, int32_t rightLength, 843 UErrorCode &errorCode) const; 844 UCollationResult doCompare(const uint8_t *left, int32_t leftLength, 845 const uint8_t *right, int32_t rightLength, 846 UErrorCode &errorCode) const; 847 848 void writeSortKey(const char16_t *s, int32_t length, 849 SortKeyByteSink &sink, UErrorCode &errorCode) const; 850 851 void writeIdenticalLevel(const char16_t *s, const char16_t *limit, 852 SortKeyByteSink &sink, UErrorCode &errorCode) const; 853 854 const CollationSettings &getDefaultSettings() const; 855 856 void setAttributeDefault(int32_t attribute) { 857 explicitlySetAttributes &= ~(static_cast<uint32_t>(1) << attribute); 858 } 859 void setAttributeExplicitly(int32_t attribute) { 860 explicitlySetAttributes |= static_cast<uint32_t>(1) << attribute; 861 } 862 UBool attributeHasBeenSetExplicitly(int32_t attribute) const { 863 // assert(0 <= attribute < ATTR_LIMIT); 864 return (explicitlySetAttributes & (static_cast<uint32_t>(1) << attribute)) != 0; 865 } 866 867 /** 868 * Tests whether a character is "unsafe" for use as a collation starting point. 
869 * 870 * @param c code point or code unit 871 * @return true if c is unsafe 872 * @see CollationElementIterator#setOffset(int) 873 */ 874 UBool isUnsafe(UChar32 c) const; 875 876 static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode); 877 UBool initMaxExpansions(UErrorCode &errorCode) const; 878 879 void setFastLatinOptions(CollationSettings &ownedSettings) const; 880 881 const CollationData *data; 882 const CollationSettings *settings; // reference-counted 883 const CollationTailoring *tailoring; // alias of cacheEntry->tailoring 884 const CollationCacheEntry *cacheEntry; // reference-counted 885 Locale validLocale; 886 uint32_t explicitlySetAttributes; 887 888 UBool actualLocaleIsSameAsValid; 889 }; 890 891 U_NAMESPACE_END 892 893 #endif // !UCONFIG_NO_COLLATION 894 895 #endif /* U_SHOW_CPLUSPLUS_API */ 896 897 #endif // TBLCOLL_H