[ tor-browser ].git.dasho

normlzr.h (31532B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ********************************************************************
      5 * COPYRIGHT:
      6 * Copyright (c) 1996-2015, International Business Machines Corporation and
      7 * others. All Rights Reserved.
      8 ********************************************************************
      9 */
     10 
     11 #ifndef NORMLZR_H
     12 #define NORMLZR_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if U_SHOW_CPLUSPLUS_API
     17 
     18 /**
     19 * \file 
     20 * \brief C++ API: Unicode Normalization
     21 */
     22 
     23 #if !UCONFIG_NO_NORMALIZATION
     24 
     25 #include "unicode/chariter.h"
     26 #include "unicode/normalizer2.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/unorm.h"
     29 #include "unicode/uobject.h"
     30 
     31 U_NAMESPACE_BEGIN
     32 /**
     33 * Old Unicode normalization API.
     34 *
     35 * This API has been replaced by the Normalizer2 class and is only available
     36 * for backward compatibility. This class simply delegates to the Normalizer2 class.
     37 * There is one exception: The new API does not provide a replacement for Normalizer::compare().
     38 *
     39 * The Normalizer class supports the standard normalization forms described in
     40 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
     41 * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
     42 *
     43 * The Normalizer class consists of two parts:
     44 * - static functions that normalize strings or test if strings are normalized
     45 * - a Normalizer object is an iterator that takes any kind of text and
     46 *   provides iteration over its normalized form
     47 *
     48 * The Normalizer class is not suitable for subclassing.
     49 *
     50 * For basic information about normalization forms and details about the C API
     51 * please see the documentation in unorm.h.
     52 *
     53 * The iterator API with the Normalizer constructors and the non-static functions
     54 * use a CharacterIterator as input. It is possible to pass a string which
     55 * is then internally wrapped in a CharacterIterator.
     56 * The input text is not normalized all at once, but incrementally where needed
     57 * (providing efficient random access).
     58 * This makes it possible to pass in a large text but spend only a small amount of time
     59 * normalizing a small part of that text.
     60 * However, if the entire text is normalized, then the iterator will be
     61 * slower than normalizing the entire text at once and iterating over the result.
     62 * A possible use of the Normalizer iterator is also to report an index into the
     63 * original text that is close to where the normalized characters come from.
     64 *
     65 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
     66 * The earlier implementation reported the getIndex() inconsistently,
     67 * and previous() could not be used after setIndex(), next(), first(), and current().
     68 *
     69 * Normalizer allows you to start normalizing from anywhere in the input text by
     70 * calling setIndexOnly(), first(), or last().
     71 * Without calling any of these, the iterator will start at the beginning of the text.
     72 *
     73 * At any time, next() returns the next normalized code point (UChar32),
     74 * with post-increment semantics (like CharacterIterator::next32PostInc()).
     75 * previous() returns the previous normalized code point (UChar32),
     76 * with pre-decrement semantics (like CharacterIterator::previous32()).
     77 *
     78 * current() returns the current code point
     79 * (respectively the one at the newly set index) without moving
     80 * the getIndex(). Note that if the text at the current position
     81 * needs to be normalized, then these functions will do that.
     82 * (This is why current() is not const.)
     83 * It is more efficient to call setIndexOnly() instead, which does not
     84 * normalize.
     85 *
     86 * getIndex() always refers to the position in the input text where the normalized
     87 * code points are returned from. It does not always change with each returned
     88 * code point.
     89 * The code point that is returned from any of the functions
     90 * corresponds to text at or after getIndex(), according to the
     91 * function's iteration semantics (post-increment or pre-decrement).
     92 *
     93 * next() returns a code point from at or after the getIndex()
     94 * from before the next() call. After the next() call, the getIndex()
     95 * might have moved to where the next code point will be returned from
     96 * (from a next() or current() call).
     97 * This is semantically equivalent to array access with array[index++]
     98 * (post-increment semantics).
     99 *
    100 * previous() returns a code point from at or after the getIndex()
    101 * from after the previous() call.
    102 * This is semantically equivalent to array access with array[--index]
    103 * (pre-decrement semantics).
    104 *
    105 * Internally, the Normalizer iterator normalizes a small piece of text
    106 * starting at the getIndex() and ending at a following "safe" index.
    107 * The normalized result is stored in an internal string buffer, and
    108 * the code points are iterated from there.
    109 * With multiple iteration calls, this is repeated until the next piece
    110 * of text needs to be normalized, and the getIndex() needs to be moved.
    111 *
    112 * The following "safe" index, the internal buffer, and the secondary
    113 * iteration index into that buffer are not exposed on the API.
    114 * This also means that it is currently not practical to return to
    115 * a particular, arbitrary position in the text because one would need to
    116 * know, and be able to set, in addition to the getIndex(), at least also the
    117 * current index into the internal buffer.
    118 * It is currently only possible to observe when getIndex() changes
    119 * (with careful consideration of the iteration semantics),
    120 * at which time the internal index will be 0.
    121 * For example, if getIndex() is different after next() than before it,
    122 * then the internal index is 0 and one can return to this getIndex()
    123 * later with setIndexOnly().
    124 *
    125 * Note: While the setIndex() and getIndex() refer to indices in the
    126 * underlying Unicode input text, the next() and previous() methods
    127 * iterate through characters in the normalized output.
    128 * This means that there is not necessarily a one-to-one correspondence
    129 * between characters returned by next() and previous() and the indices
    130 * passed to and returned from setIndex() and getIndex().
    131 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
    132 *
    133 * @author Laura Werner, Mark Davis, Markus Scherer
    134 * @stable ICU 2.0
    135 */
    136 class U_COMMON_API Normalizer : public UObject {
    137 public:
    138 #ifndef U_HIDE_DEPRECATED_API
    139  /**
    140   * If DONE is returned from an iteration function that returns a code point,
    141   * then there are no more normalization results available.
    142   * @deprecated ICU 56 Use Normalizer2 instead.
    143   */
    144  enum {
    145      DONE=0xffff
    146  };
    147 
    148  // Constructors
    149 
    150  /**
    151   * Creates a new <code>Normalizer</code> object for iterating over the
    152   * normalized form of a given string.
    153   * <p>
    154   * @param str   The string to be normalized.  The normalization
    155   *              will start at the beginning of the string.
    156   *
    157   * @param mode  The normalization mode.
    158   * @deprecated ICU 56 Use Normalizer2 instead.
    159   */
    160  Normalizer(const UnicodeString& str, UNormalizationMode mode);
    161 
    162  /**
    163   * Creates a new <code>Normalizer</code> object for iterating over the
    164   * normalized form of a given string.
    165   * <p>
    166   * @param str   The string to be normalized.  The normalization
    167   *              will start at the beginning of the string.
    168   *
    169   * @param length Length of the string, or -1 if NUL-terminated.
    170   * @param mode  The normalization mode.
    171   * @deprecated ICU 56 Use Normalizer2 instead.
    172   */
    173  Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode);
    174 
    175  /**
    176   * Creates a new <code>Normalizer</code> object for iterating over the
    177   * normalized form of the given text.
    178   * <p>
    179   * @param iter  The input text to be normalized.  The normalization
    180   *              will start at the beginning of the string.
    181   *
    182   * @param mode  The normalization mode.
    183   * @deprecated ICU 56 Use Normalizer2 instead.
    184   */
    185  Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
    186 #endif  /* U_HIDE_DEPRECATED_API */
    187 
    188 #ifndef U_FORCE_HIDE_DEPRECATED_API
    189  /**
    190   * Copy constructor.
    191   * @param copy The object to be copied.
    192   * @deprecated ICU 56 Use Normalizer2 instead.
    193   */
    194  Normalizer(const Normalizer& copy);
    195 
    196  /**
    197   * Destructor
    198   * @deprecated ICU 56 Use Normalizer2 instead.
    199   */
    200  virtual ~Normalizer();
    201 #endif  // U_FORCE_HIDE_DEPRECATED_API
    202 
    203  //-------------------------------------------------------------------------
    204  // Static utility methods
    205  //-------------------------------------------------------------------------
    206 
    207 #ifndef U_HIDE_DEPRECATED_API
    208  /**
    209   * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
    210   * This is a wrapper for unorm_normalize(), using UnicodeString's.
    211   *
    212   * The <code>options</code> parameter specifies which optional
    213   * <code>Normalizer</code> features are to be enabled for this operation.
    214   *
    215   * @param source    the input string to be normalized.
    216   * @param mode      the normalization mode
    217   * @param options   the optional features to be enabled (0 for no options)
    218   * @param result    The normalized string (on output).
    219   * @param status    The error code.
    220   * @deprecated ICU 56 Use Normalizer2 instead.
    221   */
    222  static void U_EXPORT2 normalize(const UnicodeString& source,
    223                        UNormalizationMode mode, int32_t options,
    224                        UnicodeString& result,
    225                        UErrorCode &status);
    226 
    227  /**
    228   * Compose a <code>UnicodeString</code>.
    229   * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
    230   * This is a wrapper for unorm_normalize(), using UnicodeString's.
    231   *
    232   * The <code>options</code> parameter specifies which optional
    233   * <code>Normalizer</code> features are to be enabled for this operation.
    234   *
    235   * @param source    the string to be composed.
    236   * @param compat    Perform compatibility decomposition before composition.
    237   *                  If this argument is <code>false</code>, only canonical
    238   *                  decomposition will be performed.
    239   * @param options   the optional features to be enabled (0 for no options)
    240   * @param result    The composed string (on output).
    241   * @param status    The error code.
    242   * @deprecated ICU 56 Use Normalizer2 instead.
    243   */
    244  static void U_EXPORT2 compose(const UnicodeString& source,
    245                      UBool compat, int32_t options,
    246                      UnicodeString& result,
    247                      UErrorCode &status);
    248 
    249  /**
    250   * Static method to decompose a <code>UnicodeString</code>.
    251   * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
    252   * This is a wrapper for unorm_normalize(), using UnicodeString's.
    253   *
    254   * The <code>options</code> parameter specifies which optional
    255   * <code>Normalizer</code> features are to be enabled for this operation.
    256   *
    257   * @param source    the string to be decomposed.
    258   * @param compat    Perform compatibility decomposition.
    259   *                  If this argument is <code>false</code>, only canonical
    260   *                  decomposition will be performed.
    261   * @param options   the optional features to be enabled (0 for no options)
    262   * @param result    The decomposed string (on output).
    263   * @param status    The error code.
    264   * @deprecated ICU 56 Use Normalizer2 instead.
    265   */
    266  static void U_EXPORT2 decompose(const UnicodeString& source,
    267                        UBool compat, int32_t options,
    268                        UnicodeString& result,
    269                        UErrorCode &status);
    270 
    271  /**
    272   * Performs a quick check on a string, to quickly determine if the string is
    273   * in a particular normalization format.
    274   * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
    275   *
    276   * Three types of result can be returned: UNORM_YES, UNORM_NO or
    277   * UNORM_MAYBE. Result UNORM_YES indicates that the argument
    278   * string is in the desired normalized format; UNORM_NO indicates that the
    279   * argument string is not in the desired normalized format. A
    280   * UNORM_MAYBE result indicates that a more thorough check is required,
    281   * the user may have to put the string in its normalized form and compare the
    282   * results.
    283   * @param source       string for determining if it is in a normalized format
    284   * @param mode         normalization format
    285   * @param status A reference to a UErrorCode to receive any errors
    286   * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
    287   *
    288   * @see isNormalized
    289   * @deprecated ICU 56 Use Normalizer2 instead.
    290   */
    291  static inline UNormalizationCheckResult
    292  quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
    293 
    294  /**
    295   * Performs a quick check on a string; same as the other version of quickCheck
    296   * but takes an extra options parameter like most normalization functions.
    297   *
    298   * @param source       string for determining if it is in a normalized format
    299   * @param mode         normalization format
    300   * @param options      the optional features to be enabled (0 for no options)
    301   * @param status A reference to a UErrorCode to receive any errors
    302   * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
    303   *
    304   * @see isNormalized
    305   * @deprecated ICU 56 Use Normalizer2 instead.
    306   */
    307  static UNormalizationCheckResult
    308  quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
    309 
    310  /**
    311   * Test if a string is in a given normalization form.
    312   * This is semantically equivalent to source.equals(normalize(source, mode)) .
    313   *
    314   * Unlike unorm_quickCheck(), this function returns a definitive result,
    315   * never a "maybe".
    316   * For NFD, NFKD, and FCD, both functions work exactly the same.
    317   * For NFC and NFKC where quickCheck may return "maybe", this function will
    318   * perform further tests to arrive at a true/false result.
    319   *
    320   * @param src        String that is to be tested if it is in a normalization format.
    321   * @param mode       Which normalization form to test for.
    322   * @param errorCode  ICU error code in/out parameter.
    323   *                   Must fulfill U_SUCCESS before the function call.
    324   * @return Boolean value indicating whether the source string is in the
    325   *         "mode" normalization form.
    326   *
    327   * @see quickCheck
    328   * @deprecated ICU 56 Use Normalizer2 instead.
    329   */
    330  static inline UBool
    331  isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
    332 
    333  /**
    334   * Test if a string is in a given normalization form; same as the other version of isNormalized
    335   * but takes an extra options parameter like most normalization functions.
    336   *
    337   * @param src        String that is to be tested if it is in a normalization format.
    338   * @param mode       Which normalization form to test for.
    339   * @param options      the optional features to be enabled (0 for no options)
    340   * @param errorCode  ICU error code in/out parameter.
    341   *                   Must fulfill U_SUCCESS before the function call.
    342   * @return Boolean value indicating whether the source string is in the
    343   *         "mode" normalization form.
    344   *
    345   * @see quickCheck
    346   * @deprecated ICU 56 Use Normalizer2 instead.
    347   */
    348  static UBool
    349  isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
    350 
    351  /**
    352   * Concatenate normalized strings, making sure that the result is normalized as well.
    353   *
    354   * If both the left and the right strings are in
    355   * the normalization form according to "mode/options",
    356   * then the result will be
    357   *
    358   * \code
    359   *     dest=normalize(left+right, mode, options)
    360   * \endcode
    361   *
    362   * For details see unorm_concatenate in unorm.h.
    363   *
    364   * @param left Left source string.
    365   * @param right Right source string.
    366   * @param result The output string.
    367   * @param mode The normalization mode.
    368   * @param options A bit set of normalization options.
    369   * @param errorCode ICU error code in/out parameter.
    370   *                   Must fulfill U_SUCCESS before the function call.
    371   * @return result
    372   *
    373   * @see unorm_concatenate
    374   * @see normalize
    375   * @see unorm_next
    376   * @see unorm_previous
    377   *
    378   * @deprecated ICU 56 Use Normalizer2 instead.
    379   */
    380  static UnicodeString &
    381  U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right,
    382              UnicodeString &result,
    383              UNormalizationMode mode, int32_t options,
    384              UErrorCode &errorCode);
    385 #endif  /* U_HIDE_DEPRECATED_API */
    386 
    387  /**
    388   * Compare two strings for canonical equivalence.
    389   * Further options include case-insensitive comparison and
    390   * code point order (as opposed to code unit order).
    391   *
    392   * Canonical equivalence between two strings is defined as their normalized
    393   * forms (NFD or NFC) being identical.
    394   * This function compares strings incrementally instead of normalizing
    395   * (and optionally case-folding) both strings entirely,
    396   * improving performance significantly.
    397   *
    398   * Bulk normalization is only necessary if the strings do not fulfill the FCD
    399   * conditions. Only in this case, and only if the strings are relatively long,
    400   * is memory allocated temporarily.
    401   * For FCD strings and short non-FCD strings there is no memory allocation.
    402   *
    403   * Semantically, this is equivalent to
    404   *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
    405   * where code point order and foldCase are all optional.
    406   *
    407   * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
    408   * the case folding must be performed first, then the normalization.
    409   *
    410   * @param s1 First source string.
    411   * @param s2 Second source string.
    412   *
    413   * @param options A bit set of options:
    414   *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
    415   *     Case-sensitive comparison in code unit order, and the input strings
    416   *     are quick-checked for FCD.
    417   *
    418   *   - UNORM_INPUT_IS_FCD
    419   *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
    420   *     If not set, the function will quickCheck for FCD
    421   *     and normalize if necessary.
    422   *
    423   *   - U_COMPARE_CODE_POINT_ORDER
    424   *     Set to choose code point order instead of code unit order
    425   *     (see u_strCompare for details).
    426   *
    427   *   - U_COMPARE_IGNORE_CASE
    428   *     Set to compare strings case-insensitively using case folding,
    429   *     instead of case-sensitively.
    430   *     If set, then the following case folding options are used.
    431   *
    432   *   - Options as used with case-insensitive comparisons, currently:
    433   *
    434   *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
    435   *    (see u_strCaseCompare for details)
    436   *
    437   *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
    438   *
    439   * @param errorCode ICU error code in/out parameter.
    440   *                  Must fulfill U_SUCCESS before the function call.
    441   * @return <0 or 0 or >0 as usual for string comparisons
    442   *
    443   * @see unorm_compare
    444   * @see normalize
    445   * @see UNORM_FCD
    446   * @see u_strCompare
    447   * @see u_strCaseCompare
    448   *
    449   * @stable ICU 2.2
    450   */
    451  static inline int32_t
    452  compare(const UnicodeString &s1, const UnicodeString &s2,
    453          uint32_t options,
    454          UErrorCode &errorCode);
    455 
    456 #ifndef U_HIDE_DEPRECATED_API
    457  //-------------------------------------------------------------------------
    458  // Iteration API
    459  //-------------------------------------------------------------------------
    460 
    461  /**
    462   * Return the current character in the normalized text.
    463   * current() may need to normalize some text at getIndex().
    464   * The getIndex() is not changed.
    465   *
    466   * @return the current normalized code point
    467   * @deprecated ICU 56 Use Normalizer2 instead.
    468   */
    469  UChar32 current();
    470 
    471  /**
    472   * Return the first character in the normalized text.
    473   * This is equivalent to setIndexOnly(startIndex()) followed by next().
    474   * (Post-increment semantics.)
    475   *
    476   * @return the first normalized code point
    477   * @deprecated ICU 56 Use Normalizer2 instead.
    478   */
    479  UChar32 first();
    480 
    481  /**
    482   * Return the last character in the normalized text.
    483   * This is equivalent to setIndexOnly(endIndex()) followed by previous().
    484   * (Pre-decrement semantics.)
    485   *
    486   * @return the last normalized code point
    487   * @deprecated ICU 56 Use Normalizer2 instead.
    488   */
    489  UChar32 last();
    490 
    491  /**
    492   * Return the next character in the normalized text.
    493   * (Post-increment semantics.)
    494   * If the end of the text has already been reached, DONE is returned.
    495   * The DONE value could be confused with a U+FFFF non-character code point
    496   * in the text. If this is possible, you can test getIndex()<endIndex()
    497   * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
    498   * after calling next(). (Calling last() will change the iterator state!)
    499   *
    500   * The C API unorm_next() is more efficient and does not have this ambiguity.
    501   *
    502   * @return the next normalized code point
    503   * @deprecated ICU 56 Use Normalizer2 instead.
    504   */
    505  UChar32 next();
    506 
    507  /**
    508   * Return the previous character in the normalized text and decrement.
    509   * (Pre-decrement semantics.)
    510   * If the beginning of the text has already been reached, DONE is returned.
    511   * The DONE value could be confused with a U+FFFF non-character code point
    512   * in the text. If this is possible, you can test
    513   * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
    514   * the iterator state!)
    515   *
    516   * The C API unorm_previous() is more efficient and does not have this ambiguity.
    517   *
    518   * @return the previous normalized code point
    519   * @deprecated ICU 56 Use Normalizer2 instead.
    520   */
    521  UChar32 previous();
    522 
    523  /**
    524   * Set the iteration position in the input text that is being normalized,
    525   * without any immediate normalization.
    526   * After setIndexOnly(), getIndex() will return the same index that is
    527   * specified here.
    528   *
    529   * @param index the desired index in the input text.
    530   * @deprecated ICU 56 Use Normalizer2 instead.
    531   */
    532  void                 setIndexOnly(int32_t index);
    533 
    534  /**
    535   * Reset the index to the beginning of the text.
    536   * This is equivalent to setIndexOnly(startIndex()).
    537   * @deprecated ICU 56 Use Normalizer2 instead.
    538   */
    539  void reset();
    540 
    541  /**
    542   * Retrieve the current iteration position in the input text that is
    543   * being normalized.
    544   *
    545   * A following call to next() will return a normalized code point from
    546   * the input text at or after this index.
    547   *
    548   * After a call to previous(), getIndex() will point at or before the
    549   * position in the input text where the normalized code point
    550   * was returned from with previous().
    551   *
    552   * @return the current index in the input text
    553   * @deprecated ICU 56 Use Normalizer2 instead.
    554   */
    555  int32_t getIndex() const;
    556 
    557  /**
    558   * Retrieve the index of the start of the input text. This is the begin index
    559   * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
    560   * over which this <code>Normalizer</code> is iterating.
    561   *
    562   * @return the smallest index in the input text where the Normalizer operates
    563   * @deprecated ICU 56 Use Normalizer2 instead.
    564   */
    565  int32_t startIndex() const;
    566 
    567  /**
    568   * Retrieve the index of the end of the input text. This is the end index
    569   * of the <code>CharacterIterator</code> or the length of the string
    570   * over which this <code>Normalizer</code> is iterating.
    571   * This end index is exclusive, i.e., the Normalizer operates only on characters
    572   * before this index.
    573   *
    574   * @return the first index in the input text where the Normalizer does not operate
    575   * @deprecated ICU 56 Use Normalizer2 instead.
    576   */
    577  int32_t endIndex() const;
    578 
    579  /**
    580   * Returns true when both iterators refer to the same character in the same
    581   * input text.
    582   *
    583   * @param that a Normalizer object to compare this one to
    584   * @return comparison result
    585   * @deprecated ICU 56 Use Normalizer2 instead.
    586   */
    587  bool         operator==(const Normalizer& that) const;
    588 
    589  /**
    590   * Returns false when both iterators refer to the same character in the same
    591   * input text.
    592   *
    593   * @param that a Normalizer object to compare this one to
    594   * @return comparison result
    595   * @deprecated ICU 56 Use Normalizer2 instead.
    596   */
    597  inline bool         operator!=(const Normalizer& that) const;
    598 
    599  /**
    600   * Returns a pointer to a new Normalizer that is a clone of this one.
    601   * The caller is responsible for deleting the new clone.
    602   * @return a pointer to a new Normalizer
    603   * @deprecated ICU 56 Use Normalizer2 instead.
    604   */
    605  Normalizer*        clone() const;
    606 
    607  /**
    608   * Generates a hash code for this iterator.
    609   *
    610   * @return the hash code
    611   * @deprecated ICU 56 Use Normalizer2 instead.
    612   */
    613  int32_t hashCode() const;
    614 
    615  //-------------------------------------------------------------------------
    616  // Property access methods
    617  //-------------------------------------------------------------------------
    618 
    619  /**
    620   * Set the normalization mode for this object.
    621   * <p>
    622   * <b>Note:</b> If the normalization mode is changed while iterating
    623   * over a string, calls to {@link #next() } and {@link #previous() } may
    624   * return previously buffered characters in the old normalization mode
    625   * until the iteration is able to re-sync at the next base character.
    626   * It is safest to call {@link #setIndexOnly }, {@link #reset() },
    627   * {@link #setText }, {@link #first() },
    628   * {@link #last() }, etc. after calling <code>setMode</code>.
    629   * <p>
    630   * @param newMode the new mode for this <code>Normalizer</code>.
    631   * @see #getUMode
    632   * @deprecated ICU 56 Use Normalizer2 instead.
    633   */
    634  void setMode(UNormalizationMode newMode);
    635 
    636  /**
    637   * Return the normalization mode for this object.
    638   *
    639   * This is an unusual name because there used to be a getMode() that
    640   * returned a different type.
    641   *
    642   * @return the mode for this <code>Normalizer</code>
    643   * @see #setMode
    644   * @deprecated ICU 56 Use Normalizer2 instead.
    645   */
    646  UNormalizationMode getUMode() const;
    647 
    648  /**
    649   * Set options that affect this <code>Normalizer</code>'s operation.
    650   * Options do not change the basic composition or decomposition operation
    651   * that is being performed, but they control whether
    652   * certain optional portions of the operation are done.
    653   * Currently the only available option is obsolete.
    654   *
    655   * It is possible to specify multiple options that are all turned on or off.
    656   *
    657   * @param   option  the option(s) whose value is/are to be set.
    658   * @param   value   the new setting for the option.  Use <code>true</code> to
    659   *                  turn the option(s) on and <code>false</code> to turn it/them off.
    660   *
    661   * @see #getOption
    662   * @deprecated ICU 56 Use Normalizer2 instead.
    663   */
    664  void setOption(int32_t option,
    665         UBool value);
    666 
    667  /**
    668   * Determine whether an option is turned on or off.
    669   * If multiple options are specified, then the result is true if any
    670   * of them are set.
    671   * <p>
    672   * @param option the option(s) that are to be checked
    673   * @return true if any of the option(s) are set
    674   * @see #setOption
    675   * @deprecated ICU 56 Use Normalizer2 instead.
    676   */
    677  UBool getOption(int32_t option) const;
    678 
     679  /**
     680   * Replace the input text over which this <code>Normalizer</code> iterates.
     681   * The iteration position is set to the beginning.
     682   *
     683   * @param newText a string that replaces the current input text
     684   * @param status a UErrorCode, passed by reference (presumably set to a failure code on error - TODO confirm)
     685   * @deprecated ICU 56 Use Normalizer2 instead.
     686   */
     687  void setText(const UnicodeString& newText,
     688           UErrorCode &status);
    689 
     690  /**
     691   * Replace the input text over which this <code>Normalizer</code> iterates.
     692   * The iteration position is set to the beginning.
     693   *
     694   * @param newText a CharacterIterator whose text replaces the current input text
     695   *                (taken by const reference, so presumably copied rather than adopted - TODO confirm)
     696   * @param status a UErrorCode, passed by reference (presumably set to a failure code on error - TODO confirm)
     697   * @deprecated ICU 56 Use Normalizer2 instead.
     698   */
     698  void setText(const CharacterIterator& newText,
     699           UErrorCode &status);
    700 
     701  /**
     702   * Replace the input text over which this <code>Normalizer</code> iterates.
     703   * The iteration position is set to the beginning.
     704   *
     705   * @param newText the text, passed as a ConstChar16Ptr, that replaces the current input text
     706   * @param length the length of the string, or -1 if NUL-terminated
     707   * @param status a UErrorCode, passed by reference (presumably set to a failure code on error - TODO confirm)
     708   * @deprecated ICU 56 Use Normalizer2 instead.
     709   */
     710  void setText(ConstChar16Ptr newText,
     711                    int32_t length,
     712            UErrorCode &status);
     713  /**
     714   * Copy the input text under iteration into the UnicodeString argument.
     715   * Note that this getter is declared without <code>const</code>.
     716   * @param result Receives a copy of the text under iteration.
     717   * @deprecated ICU 56 Use Normalizer2 instead.
     718   */
     719  void            getText(UnicodeString&  result);
    720 
     721  /**
     722   * ICU "poor man's RTTI": returns a UClassID for this class.
     723   * @return a UClassID for this class.
     724   * @deprecated ICU 56 Use Normalizer2 instead.
     725   */
     726  static UClassID U_EXPORT2 getStaticClassID();
    727 #endif  /* U_HIDE_DEPRECATED_API */
    728 
    729 #ifndef U_FORCE_HIDE_DEPRECATED_API
     730  /**
     731   * ICU "poor man's RTTI": returns a UClassID for the actual class (overrides a base-class virtual).
     732   * @return a UClassID for the actual class.
     733   * @deprecated ICU 56 Use Normalizer2 instead.
     734   */
     735  virtual UClassID getDynamicClassID() const override;
    736 #endif  // U_FORCE_HIDE_DEPRECATED_API
    737 
    738 private:
     739  //-------------------------------------------------------------------------
     740  // Private functions
     741  //-------------------------------------------------------------------------
     742 
     743  Normalizer() = delete; // default construction is disabled (deleted)
     744  Normalizer &operator=(const Normalizer &that) = delete; // copy assignment is disabled (deleted)
     745 
     746  // Private utility methods for iteration.
     747  // They are documented in the implementation source rather than here.
     748  UBool nextNormalize();
     749  UBool previousNormalize();
     750 
     751  void    init();
     752  void clearBuffer();
    753 
     754  //-------------------------------------------------------------------------
     755  // Private data
     756  //-------------------------------------------------------------------------
     757 
     758  FilteredNormalizer2*fFilteredNorm2;  // owned if not nullptr
     759  const Normalizer2  *fNorm2;  // not owned; may be equal to fFilteredNorm2
     760  UNormalizationMode  fUMode;  // deprecated
     761  int32_t             fOptions;  // NOTE(review): presumably the flags managed by setOption()/getOption() - confirm
     762 
     763  // The input text and our position in it
     764  CharacterIterator  *text;
     765 
     766  // The normalization buffer holds the result of normalizing
     767  // the source in the half-open range [currentIndex, nextIndex).
     768  int32_t         currentIndex, nextIndex;
     769 
     770  // A buffer for holding intermediate results
     771  UnicodeString       buffer;
     772  int32_t         bufferPos;  // NOTE(review): presumably the current position within buffer - confirm
    773 };
    774 
    775 //-------------------------------------------------------------------------
    776 // Inline implementations
    777 //-------------------------------------------------------------------------
    778 
    779 #ifndef U_HIDE_DEPRECATED_API
    780 inline bool
    781 Normalizer::operator!= (const Normalizer& other) const
    782 { return ! operator==(other); }
    783 
    784 inline UNormalizationCheckResult
    785 Normalizer::quickCheck(const UnicodeString& source,
    786                       UNormalizationMode mode,
    787                       UErrorCode &status) {
    788    return quickCheck(source, mode, 0, status);
    789 }
    790 
    791 inline UBool
    792 Normalizer::isNormalized(const UnicodeString& source,
    793                         UNormalizationMode mode,
    794                         UErrorCode &status) {
    795    return isNormalized(source, mode, 0, status);
    796 }
    797 #endif  /* U_HIDE_DEPRECATED_API */
    798 
    799 inline int32_t
    800 Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
    801                    uint32_t options,
    802                    UErrorCode &errorCode) {
    803  // all argument checking is done in unorm_compare
    804  return unorm_compare(toUCharPtr(s1.getBuffer()), s1.length(),
    805                       toUCharPtr(s2.getBuffer()), s2.length(),
    806                       options,
    807                       &errorCode);
    808 }
    809 
    810 U_NAMESPACE_END
    811 
    812 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    813 
     814 #endif /* U_SHOW_CPLUSPLUS_API */
     815 
     816 #endif // NORMLZR_H
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE