[ tor-browser ].git.dasho

normalizer2.h (35516B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2013, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  normalizer2.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov22
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __NORMALIZER2_H__
     20 #define __NORMALIZER2_H__
     21 
     22 /**
     23 * \file
     24 * \brief C++ API: New API for Unicode Normalization.
     25 */
     26 
     27 #include "unicode/utypes.h"
     28 
     29 #if U_SHOW_CPLUSPLUS_API
     30 
     31 #if !UCONFIG_NO_NORMALIZATION
     32 
     33 #include "unicode/stringpiece.h"
     34 #include "unicode/uniset.h"
     35 #include "unicode/unistr.h"
     36 #include "unicode/unorm2.h"
     37 
     38 U_NAMESPACE_BEGIN
     39 
     40 class ByteSink;
     41 
     42 /**
     43 * Unicode normalization functionality for standard Unicode normalization or
     44 * for using custom mapping tables.
     45 * All instances of this class are unmodifiable/immutable.
     46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
     47 * The Normalizer2 class is not intended for public subclassing.
     48 *
     49 * The primary functions are to produce a normalized string and to detect whether
     50 * a string is already normalized.
     51 * The most commonly used normalization forms are those defined in
     52 * http://www.unicode.org/unicode/reports/tr15/
     53 * However, this API supports additional normalization forms for specialized purposes.
     54 * For example, NFKC_Casefold is provided via getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode)
     55 * and can be used in implementations of UTS #46.
     56 *
     57 * Not only are the standard compose and decompose modes supplied,
     58 * but additional modes are provided as documented in the Mode enum.
     59 *
     60 * Some of the functions in this class identify normalization boundaries.
     61 * At a normalization boundary, the portions of the string
     62 * before it and starting from it do not interact and can be handled independently.
     63 *
     64 * The spanQuickCheckYes() stops at a normalization boundary.
     65 * When the goal is a normalized string, then the text before the boundary
     66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
     67 *
     68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
     69 * a character is guaranteed to be at a normalization boundary,
     70 * regardless of context.
     71 * This is used for moving from one normalization boundary to the next
     72 * or preceding boundary, and for performing iterative normalization.
     73 *
     74 * Iterative normalization is useful when only a small portion of a
     75 * longer string needs to be processed.
     76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
     77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
     78 * (to process only the substring for which sort key bytes are computed).
     79 *
     80 * The set of normalization boundaries returned by these functions may not be
     81 * complete: There may be more boundaries that could be returned.
     82 * Different functions may return different boundaries.
     83 * @stable ICU 4.4
     84 */
     85 class U_COMMON_API Normalizer2 : public UObject {
     86 public:
     87    /**
     88     * Destructor.
     89     * @stable ICU 4.4
     90     */
     91    ~Normalizer2();
     92 
     93    /**
     94     * Returns a Normalizer2 instance for Unicode NFC normalization.
     95     * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
     96     * Returns an unmodifiable singleton instance. Do not delete it.
     97     * @param errorCode Standard ICU error code. Its input value must
     98     *                  pass the U_SUCCESS() test, or else the function returns
     99     *                  immediately. Check for U_FAILURE() on output or use with
    100     *                  function chaining. (See User Guide for details.)
    101     * @return the requested Normalizer2, if successful
    102     * @stable ICU 49
    103     */
    104    static const Normalizer2 *
    105    getNFCInstance(UErrorCode &errorCode);
    106 
    107    /**
    108     * Returns a Normalizer2 instance for Unicode NFD normalization.
    109     * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
    110     * Returns an unmodifiable singleton instance. Do not delete it.
    111     * @param errorCode Standard ICU error code. Its input value must
    112     *                  pass the U_SUCCESS() test, or else the function returns
    113     *                  immediately. Check for U_FAILURE() on output or use with
    114     *                  function chaining. (See User Guide for details.)
    115     * @return the requested Normalizer2, if successful
    116     * @stable ICU 49
    117     */
    118    static const Normalizer2 *
    119    getNFDInstance(UErrorCode &errorCode);
    120 
    121    /**
    122     * Returns a Normalizer2 instance for Unicode NFKC normalization.
    123     * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
    124     * Returns an unmodifiable singleton instance. Do not delete it.
    125     * @param errorCode Standard ICU error code. Its input value must
    126     *                  pass the U_SUCCESS() test, or else the function returns
    127     *                  immediately. Check for U_FAILURE() on output or use with
    128     *                  function chaining. (See User Guide for details.)
    129     * @return the requested Normalizer2, if successful
    130     * @stable ICU 49
    131     */
    132    static const Normalizer2 *
    133    getNFKCInstance(UErrorCode &errorCode);
    134 
    135    /**
    136     * Returns a Normalizer2 instance for Unicode NFKD normalization.
    137     * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
    138     * Returns an unmodifiable singleton instance. Do not delete it.
    139     * @param errorCode Standard ICU error code. Its input value must
    140     *                  pass the U_SUCCESS() test, or else the function returns
    141     *                  immediately. Check for U_FAILURE() on output or use with
    142     *                  function chaining. (See User Guide for details.)
    143     * @return the requested Normalizer2, if successful
    144     * @stable ICU 49
    145     */
    146    static const Normalizer2 *
    147    getNFKDInstance(UErrorCode &errorCode);
    148 
    149    /**
    150     * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
    151     * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
    152     * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
    153     *
    154     * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
    155     * Returns an unmodifiable singleton instance. Do not delete it.
    156     * @param errorCode Standard ICU error code. Its input value must
    157     *                  pass the U_SUCCESS() test, or else the function returns
    158     *                  immediately. Check for U_FAILURE() on output or use with
    159     *                  function chaining. (See User Guide for details.)
    160     * @return the requested Normalizer2, if successful
    161     * @stable ICU 49
    162     */
    163    static const Normalizer2 *
    164    getNFKCCasefoldInstance(UErrorCode &errorCode);
    165 
    166    /**
    167     * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
    168     * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
    169     * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
    170     *
    171     * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
    172     * Returns an unmodifiable singleton instance. Do not delete it.
    173     * @param errorCode Standard ICU error code. Its input value must
    174     *                  pass the U_SUCCESS() test, or else the function returns
    175     *                  immediately. Check for U_FAILURE() on output or use with
    176     *                  function chaining. (See User Guide for details.)
    177     * @return the requested Normalizer2, if successful
    178     * @stable ICU 74
    179     */
    180    static const Normalizer2 *
    181    getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
    182 
    183    /**
    184     * Returns a Normalizer2 instance which uses the specified data file
    185     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
    186     * and which composes or decomposes text according to the specified mode.
    187     * Returns an unmodifiable singleton instance. Do not delete it.
    188     *
    189     * Use packageName=nullptr for data files that are part of ICU's own data.
    190     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
    191     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
    192     * Use name="nfkc_cf" or "nfkc_scf" and UNORM2_COMPOSE for NFKC_Casefold or its NFKC_Simple_Casefold variant.
    193     *
    194     * @param packageName nullptr for ICU built-in data, otherwise application data package name
    195     * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
    196     * @param mode normalization mode (compose or decompose etc.)
    197     * @param errorCode Standard ICU error code. Its input value must
    198     *                  pass the U_SUCCESS() test, or else the function returns
    199     *                  immediately. Check for U_FAILURE() on output or use with
    200     *                  function chaining. (See User Guide for details.)
    201     * @return the requested Normalizer2, if successful
    202     * @stable ICU 4.4
    203     */
    204    static const Normalizer2 *
    205    getInstance(const char *packageName,
    206                const char *name,
    207                UNormalization2Mode mode,
    208                UErrorCode &errorCode);
    209 
    210    /**
    211     * Convenience overload: returns the normalized form of the source string by value.
    212     * @param src source string
    213     * @param errorCode Standard ICU error code. Its input value must
    214     *                  pass the U_SUCCESS() test, or else the function returns
    215     *                  immediately. Check for U_FAILURE() on output or use with
    216     *                  function chaining. (See User Guide for details.)
    217     * @return normalized src
    218     * @stable ICU 4.4
    219     */
    220    UnicodeString
    221    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
    222        UnicodeString result;  // local destination handed to the three-argument overload
    223        normalize(src, result, errorCode);  // delegates to the virtual normalize(src, dest, errorCode)
    224        return result;
    225    }
    226    /**
    227     * Writes the normalized form of the source string to the destination string
    228     * (replacing its contents) and returns the destination string.
    229     * The source and destination strings must be different objects.
    230     * @param src source string
    231     * @param dest destination string; its contents are replaced with normalized src
    232     * @param errorCode Standard ICU error code. Its input value must
    233     *                  pass the U_SUCCESS() test, or else the function returns
    234     *                  immediately. Check for U_FAILURE() on output or use with
    235     *                  function chaining. (See User Guide for details.)
    236     * @return dest
    237     * @stable ICU 4.4
    238     */
    239    virtual UnicodeString &
    240    normalize(const UnicodeString &src,
    241              UnicodeString &dest,
    242              UErrorCode &errorCode) const = 0;
    243 
    244    /**
    245     * Normalizes a UTF-8 string and optionally records how source substrings
    246     * relate to changed and unchanged result substrings.
    247     *
    248     * Implemented completely for all built-in modes except for FCD.
    249     * The base class implementation converts to & from UTF-16 and does not support edits.
    250     *
    251     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
    252     * @param src       Source UTF-8 string.
    253     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
    254     *                  sink.Flush() is called at the end.
    255     * @param edits     Records edits for index mapping, working with styled text,
    256     *                  and getting only changes (if any).
    257     *                  The Edits contents are undefined if any error occurs.
    258     *                  This function calls edits->reset() first unless
    259     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
    260     * @param errorCode Standard ICU error code. Its input value must
    261     *                  pass the U_SUCCESS() test, or else the function returns
    262     *                  immediately. Check for U_FAILURE() on output or use with
    263     *                  function chaining. (See User Guide for details.)
    264     * @stable ICU 60
    265     */
    266    virtual void
    267    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
    268                  Edits *edits, UErrorCode &errorCode) const;
    269 
    270    /**
    271     * Appends the normalized form of the second string to the first string
    272     * (merging them at the boundary) and returns the first string.
    273     * The result is normalized if the first string was normalized.
    274     * The first and second strings must be different objects.
    275     * @param first string, should be normalized
    276     * @param second string, will be normalized
    277     * @param errorCode Standard ICU error code. Its input value must
    278     *                  pass the U_SUCCESS() test, or else the function returns
    279     *                  immediately. Check for U_FAILURE() on output or use with
    280     *                  function chaining. (See User Guide for details.)
    281     * @return first
    282     * @stable ICU 4.4
    283     */
    284    virtual UnicodeString &
    285    normalizeSecondAndAppend(UnicodeString &first,
    286                             const UnicodeString &second,
    287                             UErrorCode &errorCode) const = 0;
    288    /**
    289     * Appends the second string to the first string
    290     * (merging them at the boundary) and returns the first string.
    291     * The result is normalized if both the strings were normalized.
    292     * The first and second strings must be different objects.
    293     * @param first string, should be normalized
    294     * @param second string, should be normalized
    295     * @param errorCode Standard ICU error code. Its input value must
    296     *                  pass the U_SUCCESS() test, or else the function returns
    297     *                  immediately. Check for U_FAILURE() on output or use with
    298     *                  function chaining. (See User Guide for details.)
    299     * @return first
    300     * @stable ICU 4.4
    301     */
    302    virtual UnicodeString &
    303    append(UnicodeString &first,
    304           const UnicodeString &second,
    305           UErrorCode &errorCode) const = 0;
    306 
    307    /**
    308     * Gets the decomposition mapping of c.
    309     * Roughly equivalent to normalizing the String form of c
    310     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike that approach,
    311     * this function returns false and does not write a string
    312     * if c does not have a decomposition mapping in this instance's data.
    313     * This function is independent of the mode of the Normalizer2.
    314     * @param c code point
    315     * @param decomposition String object which will be set to c's
    316     *                      decomposition mapping, if there is one.
    317     * @return true if c has a decomposition, otherwise false
    318     * @stable ICU 4.6
    319     */
    320    virtual UBool
    321    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
    322 
    323    /**
    324     * Gets the raw decomposition mapping of c.
    325     *
    326     * This is similar to the getDecomposition() method but returns the
    327     * raw decomposition mapping as specified in UnicodeData.txt or
    328     * (for custom data) in the mapping files processed by the gennorm2 tool.
    329     * By contrast, getDecomposition() returns the processed,
    330     * recursively-decomposed version of this mapping.
    331     *
    332     * When used on a standard NFKC Normalizer2 instance,
    333     * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
    334     *
    335     * When used on a standard NFC Normalizer2 instance,
    336     * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
    337     * in this case, the result contains either one or two code points (=1..4 char16_ts).
    338     *
    339     * This function is independent of the mode of the Normalizer2.
    340     * The default implementation returns false.
    341     * @param c code point
    342     * @param decomposition String object which will be set to c's
    343     *                      raw decomposition mapping, if there is one.
    344     * @return true if c has a decomposition, otherwise false
    345     * @stable ICU 49
    346     */
    347    virtual UBool
    348    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
    349 
    350    /**
    351     * Performs pairwise composition of a & b and returns the composite if there is one.
    352     *
    353     * Returns a composite code point c only if c has a two-way mapping to a+b.
    354     * In standard Unicode normalization, this means that
    355     * c has a canonical decomposition to a+b
    356     * and c does not have the Full_Composition_Exclusion property.
    357     *
    358     * This function is independent of the mode of the Normalizer2.
    359     * The default implementation returns a negative value.
    360     * @param a A (normalization starter) code point.
    361     * @param b Another code point.
    362     * @return The non-negative composite code point if there is one; otherwise a negative value.
    363     * @stable ICU 49
    364     */
    365    virtual UChar32
    366    composePair(UChar32 a, UChar32 b) const;
    367 
    368    /**
    369     * Gets the combining class of c.
    370     * The default implementation returns 0
    371     * but all standard implementations return the Unicode Canonical_Combining_Class value.
    372     * @param c code point
    373     * @return c's combining class
    374     * @stable ICU 49
    375     */
    376    virtual uint8_t
    377    getCombiningClass(UChar32 c) const;
    378 
    379    /**
    380     * Tests if the string is normalized.
    381     * Internally, in cases where the quickCheck() method would return "maybe"
    382     * (which is only possible for the two COMPOSE modes) this method
    383     * resolves to "yes" or "no" to provide a definitive result,
    384     * at the cost of doing more work in those cases.
    385     * @param s input string
    386     * @param errorCode Standard ICU error code. Its input value must
    387     *                  pass the U_SUCCESS() test, or else the function returns
    388     *                  immediately. Check for U_FAILURE() on output or use with
    389     *                  function chaining. (See User Guide for details.)
    390     * @return true if s is normalized
    391     * @stable ICU 4.4
    392     */
    393    virtual UBool
    394    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    395    /**
    396     * Tests if the UTF-8 string is normalized.
    397     * Internally, in cases where the quickCheck() method would return "maybe"
    398     * (which is only possible for the two COMPOSE modes) this method
    399     * resolves to "yes" or "no" to provide a definitive result,
    400     * at the cost of doing more work in those cases.
    401     *
    402     * This works for all normalization modes.
    403     * It is optimized for UTF-8 for all built-in modes except for FCD.
    404     * The base class implementation converts to UTF-16 and calls isNormalized().
    405     *
    406     * @param s UTF-8 input string
    407     * @param errorCode Standard ICU error code. Its input value must
    408     *                  pass the U_SUCCESS() test, or else the function returns
    409     *                  immediately. Check for U_FAILURE() on output or use with
    410     *                  function chaining. (See User Guide for details.)
    411     * @return true if s is normalized
    412     * @stable ICU 60
    413     */
    414    virtual UBool
    415    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
    416 
    417 
    418    /**
    419     * Tests if the string is normalized.
    420     * For the two COMPOSE modes, the result could be "maybe" in cases that
    421     * would take a little more work to resolve definitively.
    422     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
    423     * combination of quick check + normalization, to avoid
    424     * re-checking the "yes" prefix.
    425     * @param s input string
    426     * @param errorCode Standard ICU error code. Its input value must
    427     *                  pass the U_SUCCESS() test, or else the function returns
    428     *                  immediately. Check for U_FAILURE() on output or use with
    429     *                  function chaining. (See User Guide for details.)
    430     * @return UNormalizationCheckResult
    431     * @stable ICU 4.4
    432     */
    433    virtual UNormalizationCheckResult
    434    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    435 
    436    /**
    437     * Returns the end of the normalized substring of the input string.
    438     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
    439     * the substring <code>UnicodeString(s, 0, end)</code>
    440     * will pass the quick check with a "yes" result.
    441     *
    442     * The returned end index is usually one or more characters before the
    443     * "no" or "maybe" character: The end index is at a normalization boundary.
    444     * (See the class documentation for more about normalization boundaries.)
    445     *
    446     * When the goal is a normalized string and most input strings are expected
    447     * to be normalized already, then call this method,
    448     * and if it returns a prefix shorter than the input string,
    449     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
    450     * @param s input string
    451     * @param errorCode Standard ICU error code. Its input value must
    452     *                  pass the U_SUCCESS() test, or else the function returns
    453     *                  immediately. Check for U_FAILURE() on output or use with
    454     *                  function chaining. (See User Guide for details.)
    455     * @return "yes" span end index
    456     * @stable ICU 4.4
    457     */
    458    virtual int32_t
    459    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
    460 
    461    /**
    462     * Tests if the character always has a normalization boundary before it,
    463     * regardless of context.
    464     * If true, then the character does not normalization-interact with
    465     * preceding characters.
    466     * In other words, a string containing this character can be normalized
    467     * by processing portions before this character and starting from this
    468     * character independently.
    469     * This is used for iterative normalization. See the class documentation for details.
    470     * @param c character to test
    471     * @return true if c has a normalization boundary before it
    472     * @stable ICU 4.4
    473     */
    474    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
    475 
    476    /**
    477     * Tests if the character always has a normalization boundary after it,
    478     * regardless of context.
    479     * If true, then the character does not normalization-interact with
    480     * following characters.
    481     * In other words, a string containing this character can be normalized
    482     * by processing portions up to this character and after this
    483     * character independently.
    484     * This is used for iterative normalization. See the class documentation for details.
    485     * Note that this operation may be significantly slower than hasBoundaryBefore().
    486     * @param c character to test
    487     * @return true if c has a normalization boundary after it
    488     * @stable ICU 4.4
    489     */
    490    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
    491 
    492    /**
    493     * Tests if the character is normalization-inert.
    494     * If true, then the character does not change, nor normalization-interact with
    495     * preceding or following characters.
    496     * In other words, a string containing this character can be normalized
    497     * by processing portions before this character and after this
    498     * character independently.
    499     * This is used for iterative normalization. See the class documentation for details.
    500     * Note that this operation may be significantly slower than hasBoundaryBefore().
    501     * @param c character to test
    502     * @return true if c is normalization-inert
    503     * @stable ICU 4.4
    504     */
    505    virtual UBool isInert(UChar32 c) const = 0;
    506 };
    507 
    508 /**
    509 * Normalization filtered by a UnicodeSet.
    510 * Normalizes portions of the text contained in the filter set and leaves
    511 * portions not contained in the filter set unchanged.
    512 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
    513 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
    514 * This class implements all of (and only) the Normalizer2 API.
    515 * An instance of this class is unmodifiable/immutable but is constructed and
    516 * must be destructed by the owner.
    517 * @stable ICU 4.4
    518 */
    519 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
    520 public:
    521    /**
    522     * Constructs a filtered normalizer wrapping any Normalizer2 instance
    523     * and a filter set.
    524     * Both are aliased and must not be modified or deleted while this object
    525     * is used.
    526     * The filter set should be frozen; otherwise the performance will suffer greatly.
    527     * @param n2 wrapped Normalizer2 instance
    528     * @param filterSet UnicodeSet which determines the characters to be normalized
    529     * @stable ICU 4.4
    530     */
    531    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
    532            norm2(n2), set(filterSet) {}
    533 
    534    /**
    535     * Destructor.
    536     * @stable ICU 4.4
    537     */
    538    ~FilteredNormalizer2();
    539 
    540    /**
    541     * Writes the normalized form of the source string to the destination string
    542     * (replacing its contents) and returns the destination string.
    543     * The source and destination strings must be different objects.
    544     * @param src source string
    545     * @param dest destination string; its contents is replaced with normalized src
    546     * @param errorCode Standard ICU error code. Its input value must
    547     *                  pass the U_SUCCESS() test, or else the function returns
    548     *                  immediately. Check for U_FAILURE() on output or use with
    549     *                  function chaining. (See User Guide for details.)
    550     * @return dest
    551     * @stable ICU 4.4
    552     */
    553    virtual UnicodeString &
    554    normalize(const UnicodeString &src,
    555              UnicodeString &dest,
    556              UErrorCode &errorCode) const override;
    557 
    558    /**
    559     * Normalizes a UTF-8 string and optionally records how source substrings
    560     * relate to changed and unchanged result substrings.
    561     *
    562     * Implemented completely for most built-in modes except for FCD.
    563     * The base class implementation converts to & from UTF-16 and does not support edits.
    564     *
    565     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
    566     * @param src       Source UTF-8 string.
    567     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
    568     *                  sink.Flush() is called at the end.
    569     * @param edits     Records edits for index mapping, working with styled text,
    570     *                  and getting only changes (if any).
    571     *                  The Edits contents is undefined if any error occurs.
    572     *                  This function calls edits->reset() first unless
    573     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
    574     * @param errorCode Standard ICU error code. Its input value must
    575     *                  pass the U_SUCCESS() test, or else the function returns
    576     *                  immediately. Check for U_FAILURE() on output or use with
    577     *                  function chaining. (See User Guide for details.)
    578     * @stable ICU 60
    579     */
    580    virtual void
    581    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
    582                  Edits *edits, UErrorCode &errorCode) const override;
    583 
    584    /**
    585     * Appends the normalized form of the second string to the first string
    586     * (merging them at the boundary) and returns the first string.
    587     * The result is normalized if the first string was normalized.
    588     * The first and second strings must be different objects.
    589     * @param first string that is appended to (modified in place); should be normalized
    590     * @param second string whose normalized form is appended; it is not modified
    591     * @param errorCode Standard ICU error code. Its input value must
    592     *                  pass the U_SUCCESS() test, or else the function returns
    593     *                  immediately. Check for U_FAILURE() on output or use with
    594     *                  function chaining. (See User Guide for details.)
    595     * @return a reference to first
    596     * @stable ICU 4.4
    597     */
    598    virtual UnicodeString &
    599    normalizeSecondAndAppend(UnicodeString &first,
    600                             const UnicodeString &second,
    601                             UErrorCode &errorCode) const override;
    602    /**
    603     * Appends the second string to the first string
    604     * (merging them at the boundary) and returns the first string.
    605     * The result is normalized if both strings were normalized.
    606     * The first and second strings must be different objects.
    607     * @param first string that is appended to (modified in place); should be normalized
    608     * @param second string to append; should be normalized, and is not modified
    609     * @param errorCode Standard ICU error code. Its input value must
    610     *                  pass the U_SUCCESS() test, or else the function returns
    611     *                  immediately. Check for U_FAILURE() on output or use with
    612     *                  function chaining. (See User Guide for details.)
    613     * @return a reference to first
    614     * @stable ICU 4.4
    615     */
    616    virtual UnicodeString &
    617    append(UnicodeString &first,
    618           const UnicodeString &second,
    619           UErrorCode &errorCode) const override;
    620 
    621    /**
    622     * Gets the decomposition mapping of c.
    623     * For details see the Normalizer2 base class documentation.
    624     *
    625     * This function is independent of the mode of the Normalizer2.
    626     * @param c code point
    627     * @param decomposition String object which will be set to c's
    628     *                      decomposition mapping, if there is one.
    629     * @return true if c has a decomposition, otherwise false
    630     * @stable ICU 4.6
    631     */
    632    virtual UBool
    633    getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
    634 
    635    /**
    636     * Gets the raw decomposition mapping of c.
    637     * For details see the Normalizer2 base class documentation.
    638     *
    639     * This function is independent of the mode of the Normalizer2.
    640     * @param c code point
    641     * @param decomposition String object which will be set to c's
    642     *                      raw decomposition mapping, if there is one.
    643     * @return true if c has a raw decomposition, otherwise false
    644     * @stable ICU 49
    645     */
    646    virtual UBool
    647    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
    648 
    649    /**
    650     * Performs pairwise composition of a and b and returns the composite if there is one.
    651     * For details see the Normalizer2 base class documentation.
    652     *
    653     * This function is independent of the mode of the Normalizer2.
    654     * @param a A (normalization starter) code point.
    655     * @param b Another code point.
    656     * @return The non-negative composite code point if there is one; otherwise a negative value.
    657     * @stable ICU 49
    658     */
    659    virtual UChar32
    660    composePair(UChar32 a, UChar32 b) const override;
    661 
    662    /**
    663     * Gets the combining class of c.
    664     * The default implementation (presumably the Normalizer2 base class one; TODO confirm) returns 0
    665     * but all standard implementations return the Unicode Canonical_Combining_Class value.
    666     * @param c code point
    667     * @return c's combining class
    668     * @stable ICU 49
    669     */
    670    virtual uint8_t
    671    getCombiningClass(UChar32 c) const override;
    672 
    673    /**
    674     * Tests if the string is normalized.
    675     * For details see the Normalizer2 base class documentation.
    676     * @param s input string
    677     * @param errorCode Standard ICU error code. Its input value must
    678     *                  pass the U_SUCCESS() test, or else the function returns
    679     *                  immediately. Check for U_FAILURE() on output or use with
    680     *                  function chaining. (See User Guide for details.)
    681     * @return true if s is normalized, otherwise false
    682     * @stable ICU 4.4
    683     */
    684    virtual UBool
    685    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
    686    /**
    687     * Tests if the UTF-8 string is normalized.
    688     * Internally, in cases where the quickCheck() method would return "maybe"
    689     * (which is only possible for the two COMPOSE modes), this method
    690     * resolves to "yes" or "no" to provide a definitive result,
    691     * at the cost of doing more work in those cases.
    692     *
    693     * This works for all normalization modes.
    694     * It is optimized for UTF-8 for all built-in modes except for FCD.
    695     * The base class implementation converts to UTF-16 and calls isNormalized().
    696     *
    697     * @param s UTF-8 input string
    698     * @param errorCode Standard ICU error code. Its input value must
    699     *                  pass the U_SUCCESS() test, or else the function returns
    700     *                  immediately. Check for U_FAILURE() on output or use with
    701     *                  function chaining. (See User Guide for details.)
    702     * @return true if s is normalized, otherwise false
    703     * @stable ICU 60
    704     */
    705    virtual UBool
    706    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
    707    /**
    708     * Quick check of whether the string is normalized; the result can be "maybe".
    709     * For details see the Normalizer2 base class documentation.
    710     * @param s input string
    711     * @param errorCode Standard ICU error code. Its input value must
    712     *                  pass the U_SUCCESS() test, or else the function returns
    713     *                  immediately. Check for U_FAILURE() on output or use with
    714     *                  function chaining. (See User Guide for details.)
    715     * @return UNormalizationCheckResult, which can be "maybe" as well as "yes" or "no"
    716     * @stable ICU 4.4
    717     */
    718    virtual UNormalizationCheckResult
    719    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
    720    /**
    721     * Returns the end of the normalized substring of the input string.
    722     * For details see the Normalizer2 base class documentation.
    723     * @param s input string
    724     * @param errorCode Standard ICU error code. Its input value must
    725     *                  pass the U_SUCCESS() test, or else the function returns
    726     *                  immediately. Check for U_FAILURE() on output or use with
    727     *                  function chaining. (See User Guide for details.)
    728     * @return end index of the quick check "yes" span
    729     * @stable ICU 4.4
    730     */
    731    virtual int32_t
    732    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
    733 
    734    /**
    735     * Tests if the character always has a normalization boundary before it,
    736     * regardless of context.
    737     * For details see the Normalizer2 base class documentation.
    738     * @param c character to test
    739     * @return true if c has a normalization boundary before it, otherwise false
    740     * @stable ICU 4.4
    741     */
    742    virtual UBool hasBoundaryBefore(UChar32 c) const override;
    743 
    744    /**
    745     * Tests if the character always has a normalization boundary after it,
    746     * regardless of context.
    747     * For details see the Normalizer2 base class documentation.
    748     * @param c character to test
    749     * @return true if c has a normalization boundary after it, otherwise false
    750     * @stable ICU 4.4
    751     */
    752    virtual UBool hasBoundaryAfter(UChar32 c) const override;
    753 
    754    /**
    755     * Tests if the character is normalization-inert.
    756     * For details see the Normalizer2 base class documentation.
    757     * @param c character to test
    758     * @return true if c is normalization-inert, otherwise false
    759     * @stable ICU 4.4
    760     */
    761    virtual UBool isInert(UChar32 c) const override;
    762 private:
    763    UnicodeString &  // Private, non-virtual helper; not part of the public API.
    764    normalize(const UnicodeString &src,
    765              UnicodeString &dest,
    766              USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition to start with when scanning src -- confirm in the implementation
    767              UErrorCode &errorCode) const;
    768 
    769    void  // Private, non-virtual overload of normalizeUTF8() taking a raw pointer and length instead of a StringPiece.
    770    normalizeUTF8(uint32_t options, const char *src, int32_t length,
    771                  ByteSink &sink, Edits *edits,
    772                  USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition to start with when scanning src -- confirm in the implementation
    773                  UErrorCode &errorCode) const;
    774 
    775    UnicodeString &  // Private, non-virtual overload with an extra doNormalize flag.
    776    normalizeSecondAndAppend(UnicodeString &first,
    777                             const UnicodeString &second,
    778                             UBool doNormalize,  // NOTE(review): presumably selects between the public normalizeSecondAndAppend() and append() behavior -- confirm in the implementation
    779                             UErrorCode &errorCode) const;
    780 
    781    const Normalizer2 &norm2;  // Held by const reference, not by value. NOTE(review): presumably the wrapped normalizer -- confirm.
    782    const UnicodeSet &set;     // Held by const reference, not by value. NOTE(review): presumably the filter set -- confirm.
    783 };
    784 
    785 U_NAMESPACE_END
    786 
    787 #endif  // !UCONFIG_NO_NORMALIZATION
    788 
    789 #endif /* U_SHOW_CPLUSPLUS_API */
    790 
    791 #endif  // __NORMALIZER2_H__
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE