normlzr.h (31532B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************** 5 * COPYRIGHT: 6 * Copyright (c) 1996-2015, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************** 9 */ 10 11 #ifndef NORMLZR_H 12 #define NORMLZR_H 13 14 #include "unicode/utypes.h" 15 16 #if U_SHOW_CPLUSPLUS_API 17 18 /** 19 * \file 20 * \brief C++ API: Unicode Normalization 21 */ 22 23 #if !UCONFIG_NO_NORMALIZATION 24 25 #include "unicode/chariter.h" 26 #include "unicode/normalizer2.h" 27 #include "unicode/unistr.h" 28 #include "unicode/unorm.h" 29 #include "unicode/uobject.h" 30 31 U_NAMESPACE_BEGIN 32 /** 33 * Old Unicode normalization API. 34 * 35 * This API has been replaced by the Normalizer2 class and is only available 36 * for backward compatibility. This class simply delegates to the Normalizer2 class. 37 * There is one exception: The new API does not provide a replacement for Normalizer::compare(). 38 * 39 * The Normalizer class supports the standard normalization forms described in 40 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 41 * Unicode Standard Annex #15: Unicode Normalization Forms</a>. 42 * 43 * The Normalizer class consists of two parts: 44 * - static functions that normalize strings or test if strings are normalized 45 * - a Normalizer object is an iterator that takes any kind of text and 46 * provides iteration over its normalized form 47 * 48 * The Normalizer class is not suitable for subclassing. 49 * 50 * For basic information about normalization forms and details about the C API 51 * please see the documentation in unorm.h. 52 * 53 * The iterator API with the Normalizer constructors and the non-static functions 54 * use a CharacterIterator as input. It is possible to pass a string which 55 * is then internally wrapped in a CharacterIterator. 56 * The input text is not normalized all at once, but incrementally where needed 57 * (providing efficient random access). 58 * This allows to pass in a large text but spend only a small amount of time 59 * normalizing a small part of that text. 60 * However, if the entire text is normalized, then the iterator will be 61 * slower than normalizing the entire text at once and iterating over the result. 62 * A possible use of the Normalizer iterator is also to report an index into the 63 * original text that is close to where the normalized characters come from. 64 * 65 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0. 66 * The earlier implementation reported the getIndex() inconsistently, 67 * and previous() could not be used after setIndex(), next(), first(), and current(). 68 * 69 * Normalizer allows to start normalizing from anywhere in the input text by 70 * calling setIndexOnly(), first(), or last(). 71 * Without calling any of these, the iterator will start at the beginning of the text. 72 * 73 * At any time, next() returns the next normalized code point (UChar32), 74 * with post-increment semantics (like CharacterIterator::next32PostInc()). 75 * previous() returns the previous normalized code point (UChar32), 76 * with pre-decrement semantics (like CharacterIterator::previous32()). 77 * 78 * current() returns the current code point 79 * (respectively the one at the newly set index) without moving 80 * the getIndex(). Note that if the text at the current position 81 * needs to be normalized, then these functions will do that. 82 * (This is why current() is not const.) 83 * It is more efficient to call setIndexOnly() instead, which does not 84 * normalize. 85 * 86 * getIndex() always refers to the position in the input text where the normalized 87 * code points are returned from. It does not always change with each returned 88 * code point. 89 * The code point that is returned from any of the functions 90 * corresponds to text at or after getIndex(), according to the 91 * function's iteration semantics (post-increment or pre-decrement). 92 * 93 * next() returns a code point from at or after the getIndex() 94 * from before the next() call. After the next() call, the getIndex() 95 * might have moved to where the next code point will be returned from 96 * (from a next() or current() call). 97 * This is semantically equivalent to array access with array[index++] 98 * (post-increment semantics). 99 * 100 * previous() returns a code point from at or after the getIndex() 101 * from after the previous() call. 102 * This is semantically equivalent to array access with array[--index] 103 * (pre-decrement semantics). 104 * 105 * Internally, the Normalizer iterator normalizes a small piece of text 106 * starting at the getIndex() and ending at a following "safe" index. 107 * The normalized results is stored in an internal string buffer, and 108 * the code points are iterated from there. 109 * With multiple iteration calls, this is repeated until the next piece 110 * of text needs to be normalized, and the getIndex() needs to be moved. 111 * 112 * The following "safe" index, the internal buffer, and the secondary 113 * iteration index into that buffer are not exposed on the API. 114 * This also means that it is currently not practical to return to 115 * a particular, arbitrary position in the text because one would need to 116 * know, and be able to set, in addition to the getIndex(), at least also the 117 * current index into the internal buffer. 118 * It is currently only possible to observe when getIndex() changes 119 * (with careful consideration of the iteration semantics), 120 * at which time the internal index will be 0. 121 * For example, if getIndex() is different after next() than before it, 122 * then the internal index is 0 and one can return to this getIndex() 123 * later with setIndexOnly(). 124 * 125 * Note: While the setIndex() and getIndex() refer to indices in the 126 * underlying Unicode input text, the next() and previous() methods 127 * iterate through characters in the normalized output. 128 * This means that there is not necessarily a one-to-one correspondence 129 * between characters returned by next() and previous() and the indices 130 * passed to and returned from setIndex() and getIndex(). 131 * It is for this reason that Normalizer does not implement the CharacterIterator interface. 132 * 133 * @author Laura Werner, Mark Davis, Markus Scherer 134 * @stable ICU 2.0 135 */ 136 class U_COMMON_API Normalizer : public UObject { 137 public: 138 #ifndef U_HIDE_DEPRECATED_API 139 /** 140 * If DONE is returned from an iteration function that returns a code point, 141 * then there are no more normalization results available. 142 * @deprecated ICU 56 Use Normalizer2 instead. 143 */ 144 enum { 145 DONE=0xffff 146 }; 147 148 // Constructors 149 150 /** 151 * Creates a new <code>Normalizer</code> object for iterating over the 152 * normalized form of a given string. 153 * <p> 154 * @param str The string to be normalized. The normalization 155 * will start at the beginning of the string. 156 * 157 * @param mode The normalization mode. 158 * @deprecated ICU 56 Use Normalizer2 instead. 159 */ 160 Normalizer(const UnicodeString& str, UNormalizationMode mode); 161 162 /** 163 * Creates a new <code>Normalizer</code> object for iterating over the 164 * normalized form of a given string. 165 * <p> 166 * @param str The string to be normalized. The normalization 167 * will start at the beginning of the string. 168 * 169 * @param length Length of the string, or -1 if NUL-terminated. 170 * @param mode The normalization mode. 171 * @deprecated ICU 56 Use Normalizer2 instead. 172 */ 173 Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode); 174 175 /** 176 * Creates a new <code>Normalizer</code> object for iterating over the 177 * normalized form of the given text. 178 * <p> 179 * @param iter The input text to be normalized. The normalization 180 * will start at the beginning of the string. 181 * 182 * @param mode The normalization mode. 183 * @deprecated ICU 56 Use Normalizer2 instead. 184 */ 185 Normalizer(const CharacterIterator& iter, UNormalizationMode mode); 186 #endif /* U_HIDE_DEPRECATED_API */ 187 188 #ifndef U_FORCE_HIDE_DEPRECATED_API 189 /** 190 * Copy constructor. 191 * @param copy The object to be copied. 192 * @deprecated ICU 56 Use Normalizer2 instead. 193 */ 194 Normalizer(const Normalizer& copy); 195 196 /** 197 * Destructor 198 * @deprecated ICU 56 Use Normalizer2 instead. 199 */ 200 virtual ~Normalizer(); 201 #endif // U_FORCE_HIDE_DEPRECATED_API 202 203 //------------------------------------------------------------------------- 204 // Static utility methods 205 //------------------------------------------------------------------------- 206 207 #ifndef U_HIDE_DEPRECATED_API 208 /** 209 * Normalizes a <code>UnicodeString</code> according to the specified normalization mode. 210 * This is a wrapper for unorm_normalize(), using UnicodeString's. 211 * 212 * The <code>options</code> parameter specifies which optional 213 * <code>Normalizer</code> features are to be enabled for this operation. 214 * 215 * @param source the input string to be normalized. 216 * @param mode the normalization mode 217 * @param options the optional features to be enabled (0 for no options) 218 * @param result The normalized string (on output). 219 * @param status The error code. 220 * @deprecated ICU 56 Use Normalizer2 instead. 221 */ 222 static void U_EXPORT2 normalize(const UnicodeString& source, 223 UNormalizationMode mode, int32_t options, 224 UnicodeString& result, 225 UErrorCode &status); 226 227 /** 228 * Compose a <code>UnicodeString</code>. 229 * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC. 230 * This is a wrapper for unorm_normalize(), using UnicodeString's. 231 * 232 * The <code>options</code> parameter specifies which optional 233 * <code>Normalizer</code> features are to be enabled for this operation. 234 * 235 * @param source the string to be composed. 236 * @param compat Perform compatibility decomposition before composition. 237 * If this argument is <code>false</code>, only canonical 238 * decomposition will be performed. 239 * @param options the optional features to be enabled (0 for no options) 240 * @param result The composed string (on output). 241 * @param status The error code. 242 * @deprecated ICU 56 Use Normalizer2 instead. 243 */ 244 static void U_EXPORT2 compose(const UnicodeString& source, 245 UBool compat, int32_t options, 246 UnicodeString& result, 247 UErrorCode &status); 248 249 /** 250 * Static method to decompose a <code>UnicodeString</code>. 251 * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD. 252 * This is a wrapper for unorm_normalize(), using UnicodeString's. 253 * 254 * The <code>options</code> parameter specifies which optional 255 * <code>Normalizer</code> features are to be enabled for this operation. 256 * 257 * @param source the string to be decomposed. 258 * @param compat Perform compatibility decomposition. 259 * If this argument is <code>false</code>, only canonical 260 * decomposition will be performed. 261 * @param options the optional features to be enabled (0 for no options) 262 * @param result The decomposed string (on output). 263 * @param status The error code. 264 * @deprecated ICU 56 Use Normalizer2 instead. 265 */ 266 static void U_EXPORT2 decompose(const UnicodeString& source, 267 UBool compat, int32_t options, 268 UnicodeString& result, 269 UErrorCode &status); 270 271 /** 272 * Performing quick check on a string, to quickly determine if the string is 273 * in a particular normalization format. 274 * This is a wrapper for unorm_quickCheck(), using a UnicodeString. 275 * 276 * Three types of result can be returned UNORM_YES, UNORM_NO or 277 * UNORM_MAYBE. Result UNORM_YES indicates that the argument 278 * string is in the desired normalized format, UNORM_NO determines that 279 * argument string is not in the desired normalized format. A 280 * UNORM_MAYBE result indicates that a more thorough check is required, 281 * the user may have to put the string in its normalized form and compare the 282 * results. 283 * @param source string for determining if it is in a normalized format 284 * @param mode normalization format 285 * @param status A reference to a UErrorCode to receive any errors 286 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE 287 * 288 * @see isNormalized 289 * @deprecated ICU 56 Use Normalizer2 instead. 290 */ 291 static inline UNormalizationCheckResult 292 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status); 293 294 /** 295 * Performing quick check on a string; same as the other version of quickCheck 296 * but takes an extra options parameter like most normalization functions. 297 * 298 * @param source string for determining if it is in a normalized format 299 * @param mode normalization format 300 * @param options the optional features to be enabled (0 for no options) 301 * @param status A reference to a UErrorCode to receive any errors 302 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE 303 * 304 * @see isNormalized 305 * @deprecated ICU 56 Use Normalizer2 instead. 306 */ 307 static UNormalizationCheckResult 308 quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status); 309 310 /** 311 * Test if a string is in a given normalization form. 312 * This is semantically equivalent to source.equals(normalize(source, mode)) . 313 * 314 * Unlike unorm_quickCheck(), this function returns a definitive result, 315 * never a "maybe". 316 * For NFD, NFKD, and FCD, both functions work exactly the same. 317 * For NFC and NFKC where quickCheck may return "maybe", this function will 318 * perform further tests to arrive at a true/false result. 319 * 320 * @param src String that is to be tested if it is in a normalization format. 321 * @param mode Which normalization form to test for. 322 * @param errorCode ICU error code in/out parameter. 323 * Must fulfill U_SUCCESS before the function call. 324 * @return Boolean value indicating whether the source string is in the 325 * "mode" normalization form. 326 * 327 * @see quickCheck 328 * @deprecated ICU 56 Use Normalizer2 instead. 329 */ 330 static inline UBool 331 isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode); 332 333 /** 334 * Test if a string is in a given normalization form; same as the other version of isNormalized 335 * but takes an extra options parameter like most normalization functions. 336 * 337 * @param src String that is to be tested if it is in a normalization format. 338 * @param mode Which normalization form to test for. 339 * @param options the optional features to be enabled (0 for no options) 340 * @param errorCode ICU error code in/out parameter. 341 * Must fulfill U_SUCCESS before the function call. 342 * @return Boolean value indicating whether the source string is in the 343 * "mode" normalization form. 344 * 345 * @see quickCheck 346 * @deprecated ICU 56 Use Normalizer2 instead. 347 */ 348 static UBool 349 isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode); 350 351 /** 352 * Concatenate normalized strings, making sure that the result is normalized as well. 353 * 354 * If both the left and the right strings are in 355 * the normalization form according to "mode/options", 356 * then the result will be 357 * 358 * \code 359 * dest=normalize(left+right, mode, options) 360 * \endcode 361 * 362 * For details see unorm_concatenate in unorm.h. 363 * 364 * @param left Left source string. 365 * @param right Right source string. 366 * @param result The output string. 367 * @param mode The normalization mode. 368 * @param options A bit set of normalization options. 369 * @param errorCode ICU error code in/out parameter. 370 * Must fulfill U_SUCCESS before the function call. 371 * @return result 372 * 373 * @see unorm_concatenate 374 * @see normalize 375 * @see unorm_next 376 * @see unorm_previous 377 * 378 * @deprecated ICU 56 Use Normalizer2 instead. 379 */ 380 static UnicodeString & 381 U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right, 382 UnicodeString &result, 383 UNormalizationMode mode, int32_t options, 384 UErrorCode &errorCode); 385 #endif /* U_HIDE_DEPRECATED_API */ 386 387 /** 388 * Compare two strings for canonical equivalence. 389 * Further options include case-insensitive comparison and 390 * code point order (as opposed to code unit order). 391 * 392 * Canonical equivalence between two strings is defined as their normalized 393 * forms (NFD or NFC) being identical. 394 * This function compares strings incrementally instead of normalizing 395 * (and optionally case-folding) both strings entirely, 396 * improving performance significantly. 397 * 398 * Bulk normalization is only necessary if the strings do not fulfill the FCD 399 * conditions. Only in this case, and only if the strings are relatively long, 400 * is memory allocated temporarily. 401 * For FCD strings and short non-FCD strings there is no memory allocation. 402 * 403 * Semantically, this is equivalent to 404 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) 405 * where code point order and foldCase are all optional. 406 * 407 * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match 408 * the case folding must be performed first, then the normalization. 409 * 410 * @param s1 First source string. 411 * @param s2 Second source string. 412 * 413 * @param options A bit set of options: 414 * - U_FOLD_CASE_DEFAULT or 0 is used for default options: 415 * Case-sensitive comparison in code unit order, and the input strings 416 * are quick-checked for FCD. 417 * 418 * - UNORM_INPUT_IS_FCD 419 * Set if the caller knows that both s1 and s2 fulfill the FCD conditions. 420 * If not set, the function will quickCheck for FCD 421 * and normalize if necessary. 422 * 423 * - U_COMPARE_CODE_POINT_ORDER 424 * Set to choose code point order instead of code unit order 425 * (see u_strCompare for details). 426 * 427 * - U_COMPARE_IGNORE_CASE 428 * Set to compare strings case-insensitively using case folding, 429 * instead of case-sensitively. 430 * If set, then the following case folding options are used. 431 * 432 * - Options as used with case-insensitive comparisons, currently: 433 * 434 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I 435 * (see u_strCaseCompare for details) 436 * 437 * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT 438 * 439 * @param errorCode ICU error code in/out parameter. 440 * Must fulfill U_SUCCESS before the function call. 441 * @return <0 or 0 or >0 as usual for string comparisons 442 * 443 * @see unorm_compare 444 * @see normalize 445 * @see UNORM_FCD 446 * @see u_strCompare 447 * @see u_strCaseCompare 448 * 449 * @stable ICU 2.2 450 */ 451 static inline int32_t 452 compare(const UnicodeString &s1, const UnicodeString &s2, 453 uint32_t options, 454 UErrorCode &errorCode); 455 456 #ifndef U_HIDE_DEPRECATED_API 457 //------------------------------------------------------------------------- 458 // Iteration API 459 //------------------------------------------------------------------------- 460 461 /** 462 * Return the current character in the normalized text. 463 * current() may need to normalize some text at getIndex(). 464 * The getIndex() is not changed. 465 * 466 * @return the current normalized code point 467 * @deprecated ICU 56 Use Normalizer2 instead. 468 */ 469 UChar32 current(); 470 471 /** 472 * Return the first character in the normalized text. 473 * This is equivalent to setIndexOnly(startIndex()) followed by next(). 474 * (Post-increment semantics.) 475 * 476 * @return the first normalized code point 477 * @deprecated ICU 56 Use Normalizer2 instead. 478 */ 479 UChar32 first(); 480 481 /** 482 * Return the last character in the normalized text. 483 * This is equivalent to setIndexOnly(endIndex()) followed by previous(). 484 * (Pre-decrement semantics.) 485 * 486 * @return the last normalized code point 487 * @deprecated ICU 56 Use Normalizer2 instead. 488 */ 489 UChar32 last(); 490 491 /** 492 * Return the next character in the normalized text. 493 * (Post-increment semantics.) 494 * If the end of the text has already been reached, DONE is returned. 495 * The DONE value could be confused with a U+FFFF non-character code point 496 * in the text. If this is possible, you can test getIndex()<endIndex() 497 * before calling next(), or (getIndex()<endIndex() || last()!=DONE) 498 * after calling next(). (Calling last() will change the iterator state!) 499 * 500 * The C API unorm_next() is more efficient and does not have this ambiguity. 501 * 502 * @return the next normalized code point 503 * @deprecated ICU 56 Use Normalizer2 instead. 504 */ 505 UChar32 next(); 506 507 /** 508 * Return the previous character in the normalized text and decrement. 509 * (Pre-decrement semantics.) 510 * If the beginning of the text has already been reached, DONE is returned. 511 * The DONE value could be confused with a U+FFFF non-character code point 512 * in the text. If this is possible, you can test 513 * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change 514 * the iterator state!) 515 * 516 * The C API unorm_previous() is more efficient and does not have this ambiguity. 517 * 518 * @return the previous normalized code point 519 * @deprecated ICU 56 Use Normalizer2 instead. 520 */ 521 UChar32 previous(); 522 523 /** 524 * Set the iteration position in the input text that is being normalized, 525 * without any immediate normalization. 526 * After setIndexOnly(), getIndex() will return the same index that is 527 * specified here. 528 * 529 * @param index the desired index in the input text. 530 * @deprecated ICU 56 Use Normalizer2 instead. 531 */ 532 void setIndexOnly(int32_t index); 533 534 /** 535 * Reset the index to the beginning of the text. 536 * This is equivalent to setIndexOnly(startIndex)). 537 * @deprecated ICU 56 Use Normalizer2 instead. 538 */ 539 void reset(); 540 541 /** 542 * Retrieve the current iteration position in the input text that is 543 * being normalized. 544 * 545 * A following call to next() will return a normalized code point from 546 * the input text at or after this index. 547 * 548 * After a call to previous(), getIndex() will point at or before the 549 * position in the input text where the normalized code point 550 * was returned from with previous(). 551 * 552 * @return the current index in the input text 553 * @deprecated ICU 56 Use Normalizer2 instead. 554 */ 555 int32_t getIndex() const; 556 557 /** 558 * Retrieve the index of the start of the input text. This is the begin index 559 * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string 560 * over which this <code>Normalizer</code> is iterating. 561 * 562 * @return the smallest index in the input text where the Normalizer operates 563 * @deprecated ICU 56 Use Normalizer2 instead. 564 */ 565 int32_t startIndex() const; 566 567 /** 568 * Retrieve the index of the end of the input text. This is the end index 569 * of the <code>CharacterIterator</code> or the length of the string 570 * over which this <code>Normalizer</code> is iterating. 571 * This end index is exclusive, i.e., the Normalizer operates only on characters 572 * before this index. 573 * 574 * @return the first index in the input text where the Normalizer does not operate 575 * @deprecated ICU 56 Use Normalizer2 instead. 576 */ 577 int32_t endIndex() const; 578 579 /** 580 * Returns true when both iterators refer to the same character in the same 581 * input text. 582 * 583 * @param that a Normalizer object to compare this one to 584 * @return comparison result 585 * @deprecated ICU 56 Use Normalizer2 instead. 586 */ 587 bool operator==(const Normalizer& that) const; 588 589 /** 590 * Returns false when both iterators refer to the same character in the same 591 * input text. 592 * 593 * @param that a Normalizer object to compare this one to 594 * @return comparison result 595 * @deprecated ICU 56 Use Normalizer2 instead. 596 */ 597 inline bool operator!=(const Normalizer& that) const; 598 599 /** 600 * Returns a pointer to a new Normalizer that is a clone of this one. 601 * The caller is responsible for deleting the new clone. 602 * @return a pointer to a new Normalizer 603 * @deprecated ICU 56 Use Normalizer2 instead. 604 */ 605 Normalizer* clone() const; 606 607 /** 608 * Generates a hash code for this iterator. 609 * 610 * @return the hash code 611 * @deprecated ICU 56 Use Normalizer2 instead. 612 */ 613 int32_t hashCode() const; 614 615 //------------------------------------------------------------------------- 616 // Property access methods 617 //------------------------------------------------------------------------- 618 619 /** 620 * Set the normalization mode for this object. 621 * <p> 622 * <b>Note:</b>If the normalization mode is changed while iterating 623 * over a string, calls to {@link #next() } and {@link #previous() } may 624 * return previously buffers characters in the old normalization mode 625 * until the iteration is able to re-sync at the next base character. 626 * It is safest to call {@link #setIndexOnly }, {@link #reset() }, 627 * {@link #setText }, {@link #first() }, 628 * {@link #last() }, etc. after calling <code>setMode</code>. 629 * <p> 630 * @param newMode the new mode for this <code>Normalizer</code>. 631 * @see #getUMode 632 * @deprecated ICU 56 Use Normalizer2 instead. 633 */ 634 void setMode(UNormalizationMode newMode); 635 636 /** 637 * Return the normalization mode for this object. 638 * 639 * This is an unusual name because there used to be a getMode() that 640 * returned a different type. 641 * 642 * @return the mode for this <code>Normalizer</code> 643 * @see #setMode 644 * @deprecated ICU 56 Use Normalizer2 instead. 645 */ 646 UNormalizationMode getUMode() const; 647 648 /** 649 * Set options that affect this <code>Normalizer</code>'s operation. 650 * Options do not change the basic composition or decomposition operation 651 * that is being performed, but they control whether 652 * certain optional portions of the operation are done. 653 * Currently the only available option is obsolete. 654 * 655 * It is possible to specify multiple options that are all turned on or off. 656 * 657 * @param option the option(s) whose value is/are to be set. 658 * @param value the new setting for the option. Use <code>true</code> to 659 * turn the option(s) on and <code>false</code> to turn it/them off. 660 * 661 * @see #getOption 662 * @deprecated ICU 56 Use Normalizer2 instead. 663 */ 664 void setOption(int32_t option, 665 UBool value); 666 667 /** 668 * Determine whether an option is turned on or off. 669 * If multiple options are specified, then the result is true if any 670 * of them are set. 671 * <p> 672 * @param option the option(s) that are to be checked 673 * @return true if any of the option(s) are set 674 * @see #setOption 675 * @deprecated ICU 56 Use Normalizer2 instead. 676 */ 677 UBool getOption(int32_t option) const; 678 679 /** 680 * Set the input text over which this <code>Normalizer</code> will iterate. 681 * The iteration position is set to the beginning. 682 * 683 * @param newText a string that replaces the current input text 684 * @param status a UErrorCode 685 * @deprecated ICU 56 Use Normalizer2 instead. 686 */ 687 void setText(const UnicodeString& newText, 688 UErrorCode &status); 689 690 /** 691 * Set the input text over which this <code>Normalizer</code> will iterate. 692 * The iteration position is set to the beginning. 693 * 694 * @param newText a CharacterIterator object that replaces the current input text 695 * @param status a UErrorCode 696 * @deprecated ICU 56 Use Normalizer2 instead. 697 */ 698 void setText(const CharacterIterator& newText, 699 UErrorCode &status); 700 701 /** 702 * Set the input text over which this <code>Normalizer</code> will iterate. 703 * The iteration position is set to the beginning. 704 * 705 * @param newText a string that replaces the current input text 706 * @param length the length of the string, or -1 if NUL-terminated 707 * @param status a UErrorCode 708 * @deprecated ICU 56 Use Normalizer2 instead. 709 */ 710 void setText(ConstChar16Ptr newText, 711 int32_t length, 712 UErrorCode &status); 713 /** 714 * Copies the input text into the UnicodeString argument. 715 * 716 * @param result Receives a copy of the text under iteration. 717 * @deprecated ICU 56 Use Normalizer2 instead. 718 */ 719 void getText(UnicodeString& result); 720 721 /** 722 * ICU "poor man's RTTI", returns a UClassID for this class. 723 * @returns a UClassID for this class. 724 * @deprecated ICU 56 Use Normalizer2 instead. 725 */ 726 static UClassID U_EXPORT2 getStaticClassID(); 727 #endif /* U_HIDE_DEPRECATED_API */ 728 729 #ifndef U_FORCE_HIDE_DEPRECATED_API 730 /** 731 * ICU "poor man's RTTI", returns a UClassID for the actual class. 732 * @return a UClassID for the actual class. 733 * @deprecated ICU 56 Use Normalizer2 instead. 734 */ 735 virtual UClassID getDynamicClassID() const override; 736 #endif // U_FORCE_HIDE_DEPRECATED_API 737 738 private: 739 //------------------------------------------------------------------------- 740 // Private functions 741 //------------------------------------------------------------------------- 742 743 Normalizer() = delete; // default constructor not implemented 744 Normalizer &operator=(const Normalizer &that) = delete; // assignment operator not implemented 745 746 // Private utility methods for iteration 747 // For documentation, see the source code 748 UBool nextNormalize(); 749 UBool previousNormalize(); 750 751 void init(); 752 void clearBuffer(); 753 754 //------------------------------------------------------------------------- 755 // Private data 756 //------------------------------------------------------------------------- 757 758 FilteredNormalizer2*fFilteredNorm2; // owned if not nullptr 759 const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2 760 UNormalizationMode fUMode; // deprecated 761 int32_t fOptions; 762 763 // The input text and our position in it 764 CharacterIterator *text; 765 766 // The normalization buffer is the result of normalization 767 // of the source in [currentIndex..nextIndex[ . 768 int32_t currentIndex, nextIndex; 769 770 // A buffer for holding intermediate results 771 UnicodeString buffer; 772 int32_t bufferPos; 773 }; 774 775 //------------------------------------------------------------------------- 776 // Inline implementations 777 //------------------------------------------------------------------------- 778 779 #ifndef U_HIDE_DEPRECATED_API 780 inline bool 781 Normalizer::operator!= (const Normalizer& other) const 782 { return ! operator==(other); } 783 784 inline UNormalizationCheckResult 785 Normalizer::quickCheck(const UnicodeString& source, 786 UNormalizationMode mode, 787 UErrorCode &status) { 788 return quickCheck(source, mode, 0, status); 789 } 790 791 inline UBool 792 Normalizer::isNormalized(const UnicodeString& source, 793 UNormalizationMode mode, 794 UErrorCode &status) { 795 return isNormalized(source, mode, 0, status); 796 } 797 #endif /* U_HIDE_DEPRECATED_API */ 798 799 inline int32_t 800 Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2, 801 uint32_t options, 802 UErrorCode &errorCode) { 803 // all argument checking is done in unorm_compare 804 return unorm_compare(toUCharPtr(s1.getBuffer()), s1.length(), 805 toUCharPtr(s2.getBuffer()), s2.length(), 806 options, 807 &errorCode); 808 } 809 810 U_NAMESPACE_END 811 812 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 813 814 #endif // NORMLZR_H 815 816 #endif /* U_SHOW_CPLUSPLUS_API */