tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

normalizer2impl.h (46144B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  normalizer2impl.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov22
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __NORMALIZER2IMPL_H__
     20 #define __NORMALIZER2IMPL_H__
     21 
     22 #include "unicode/utypes.h"
     23 
     24 #if !UCONFIG_NO_NORMALIZATION
     25 
     26 #include "unicode/normalizer2.h"
     27 #include "unicode/ucptrie.h"
     28 #include "unicode/unistr.h"
     29 #include "unicode/unorm.h"
     30 #include "unicode/utf.h"
     31 #include "unicode/utf16.h"
     32 #include "mutex.h"
     33 #include "udataswp.h"
     34 #include "uset_imp.h"
     35 
     36 // When the nfc.nrm data is *not* hardcoded into the common library
     37 // (with this constant set to 0),
     38 // then it needs to be built into the data package:
     39 // Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT
     40 #define NORM2_HARDCODE_NFC_DATA 1
     41 
     42 U_NAMESPACE_BEGIN
     43 
     44 struct CanonIterData;
     45 
     46 class ByteSink;
     47 class Edits;
     48 class InitCanonIterData;
     49 class LcccContext;
     50 
     51 class U_COMMON_API Hangul {
     52 public:
     53    /* Korean Hangul and Jamo constants */
     54    enum {
     55        JAMO_L_BASE=0x1100,     /* "lead" jamo */
     56        JAMO_L_END=0x1112,
     57        JAMO_V_BASE=0x1161,     /* "vowel" jamo */
     58        JAMO_V_END=0x1175,
     59        JAMO_T_BASE=0x11a7,     /* "trail" jamo */
     60        JAMO_T_END=0x11c2,
     61 
     62        HANGUL_BASE=0xac00,
     63        HANGUL_END=0xd7a3,
     64 
     65        JAMO_L_COUNT=19,
     66        JAMO_V_COUNT=21,
     67        JAMO_T_COUNT=28,
     68 
     69        JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
     70 
     71        HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
     72        HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
     73    };
     74 
     75    static inline UBool isHangul(UChar32 c) {
     76        return HANGUL_BASE<=c && c<HANGUL_LIMIT;
     77    }
     78    static inline UBool
     79    isHangulLV(UChar32 c) {
     80        c-=HANGUL_BASE;
     81        return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
     82    }
     83    static inline UBool isJamoL(UChar32 c) {
     84        return static_cast<uint32_t>(c - JAMO_L_BASE) < JAMO_L_COUNT;
     85    }
     86    static inline UBool isJamoV(UChar32 c) {
     87        return static_cast<uint32_t>(c - JAMO_V_BASE) < JAMO_V_COUNT;
     88    }
     89    static inline UBool isJamoT(UChar32 c) {
     90        int32_t t=c-JAMO_T_BASE;
     91        return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself
     92    }
     93    static UBool isJamo(UChar32 c) {
     94        return JAMO_L_BASE<=c && c<=JAMO_T_END &&
     95            (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
     96    }
     97 
     98    /**
     99     * Decomposes c, which must be a Hangul syllable, into buffer
    100     * and returns the length of the decomposition (2 or 3).
    101     */
    102    static inline int32_t decompose(UChar32 c, char16_t buffer[3]) {
    103        c-=HANGUL_BASE;
    104        UChar32 c2=c%JAMO_T_COUNT;
    105        c/=JAMO_T_COUNT;
    106        buffer[0] = static_cast<char16_t>(JAMO_L_BASE + c / JAMO_V_COUNT);
    107        buffer[1] = static_cast<char16_t>(JAMO_V_BASE + c % JAMO_V_COUNT);
    108        if(c2==0) {
    109            return 2;
    110        } else {
    111            buffer[2] = static_cast<char16_t>(JAMO_T_BASE + c2);
    112            return 3;
    113        }
    114    }
    115 
    116    /**
    117     * Decomposes c, which must be a Hangul syllable, into buffer.
    118     * This is the raw, not recursive, decomposition. Its length is always 2.
    119     */
    120    static inline void getRawDecomposition(UChar32 c, char16_t buffer[2]) {
    121        UChar32 orig=c;
    122        c-=HANGUL_BASE;
    123        UChar32 c2=c%JAMO_T_COUNT;
    124        if(c2==0) {
    125            c/=JAMO_T_COUNT;
    126            buffer[0] = static_cast<char16_t>(JAMO_L_BASE + c / JAMO_V_COUNT);
    127            buffer[1] = static_cast<char16_t>(JAMO_V_BASE + c % JAMO_V_COUNT);
    128        } else {
    129            buffer[0] = static_cast<char16_t>(orig - c2); // LV syllable
    130            buffer[1] = static_cast<char16_t>(JAMO_T_BASE + c2);
    131        }
    132    }
    133 private:
    134    Hangul() = delete;  // no instantiation
    135 };
    136 
    137 class Normalizer2Impl;
    138 
    139 class U_COMMON_API ReorderingBuffer : public UMemory {
    140 public:
    141    /** Constructs only; init() should be called. */
    142    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
    143        impl(ni), str(dest),
    144        start(nullptr), reorderStart(nullptr), limit(nullptr),
    145        remainingCapacity(0), lastCC(0) {}
    146    /** Constructs, removes the string contents, and initializes for a small initial capacity. */
    147    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
    148    ~ReorderingBuffer() {
    149        if (start != nullptr) {
    150            str.releaseBuffer(static_cast<int32_t>(limit - start));
    151        }
    152    }
    153    UBool init(int32_t destCapacity, UErrorCode &errorCode);
    154 
    155    UBool isEmpty() const { return start==limit; }
    156    int32_t length() const { return static_cast<int32_t>(limit - start); }
    157    char16_t *getStart() { return start; }
    158    char16_t *getLimit() { return limit; }
    159    uint8_t getLastCC() const { return lastCC; }
    160 
    161    UBool equals(const char16_t *start, const char16_t *limit) const;
    162    UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
    163 
    164    UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
    165        return (c<=0xffff) ?
    166            appendBMP(static_cast<char16_t>(c), cc, errorCode) :
    167            appendSupplementary(c, cc, errorCode);
    168    }
    169    UBool append(const char16_t *s, int32_t length, UBool isNFD,
    170                 uint8_t leadCC, uint8_t trailCC,
    171                 UErrorCode &errorCode);
    172    UBool appendBMP(char16_t c, uint8_t cc, UErrorCode &errorCode) {
    173        if(remainingCapacity==0 && !resize(1, errorCode)) {
    174            return false;
    175        }
    176        if(lastCC<=cc || cc==0) {
    177            *limit++=c;
    178            lastCC=cc;
    179            if(cc<=1) {
    180                reorderStart=limit;
    181            }
    182        } else {
    183            insert(c, cc);
    184        }
    185        --remainingCapacity;
    186        return true;
    187    }
    188    UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
    189    UBool appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode);
    190    void remove();
    191    void removeSuffix(int32_t suffixLength);
    192    void setReorderingLimit(char16_t *newLimit) {
    193        remainingCapacity += static_cast<int32_t>(limit - newLimit);
    194        reorderStart=limit=newLimit;
    195        lastCC=0;
    196    }
    197    void copyReorderableSuffixTo(UnicodeString &s) const {
    198        s.setTo(ConstChar16Ptr(reorderStart), static_cast<int32_t>(limit - reorderStart));
    199    }
    200 private:
    201    /*
    202     * TODO: Revisit whether it makes sense to track reorderStart.
    203     * It is set to after the last known character with cc<=1,
    204     * which stops previousCC() before it reads that character and looks up its cc.
    205     * previousCC() is normally only called from insert().
    206     * In other words, reorderStart speeds up the insertion of a combining mark
    207     * into a multi-combining mark sequence where it does not belong at the end.
    208     * This might not be worth the trouble.
    209     * On the other hand, it's not a huge amount of trouble.
    210     *
    211     * We probably need it for UNORM_SIMPLE_APPEND.
    212     */
    213 
    214    UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
    215    void insert(UChar32 c, uint8_t cc);
    216    static void writeCodePoint(char16_t *p, UChar32 c) {
    217        if(c<=0xffff) {
    218            *p = static_cast<char16_t>(c);
    219        } else {
    220            p[0]=U16_LEAD(c);
    221            p[1]=U16_TRAIL(c);
    222        }
    223    }
    224    UBool resize(int32_t appendLength, UErrorCode &errorCode);
    225 
    226    const Normalizer2Impl &impl;
    227    UnicodeString &str;
    228    char16_t *start, *reorderStart, *limit;
    229    int32_t remainingCapacity;
    230    uint8_t lastCC;
    231 
    232    // private backward iterator
    233    void setIterator() { codePointStart=limit; }
    234    void skipPrevious();  // Requires start<codePointStart.
    235    uint8_t previousCC();  // Returns 0 if there is no previous character.
    236 
    237    char16_t *codePointStart, *codePointLimit;
    238 };
    239 
    240 /**
    241 * Low-level implementation of the Unicode Normalization Algorithm.
    242 * For the data structure and details see the documentation at the end of
    243 * this normalizer2impl.h and in the design doc at
    244 * https://unicode-org.github.io/icu/design/normalization/custom.html
    245 */
    246 class U_COMMON_API_CLASS Normalizer2Impl : public UObject {
    247 public:
    248    U_COMMON_API Normalizer2Impl() : normTrie(nullptr), fCanonIterData(nullptr) {}
    249    U_COMMON_API virtual ~Normalizer2Impl();
    250 
    251    U_COMMON_API void init(const int32_t* inIndexes,
    252                           const UCPTrie* inTrie,
    253                           const uint16_t* inExtraData,
    254                           const uint8_t* inSmallFCD);
    255 
    256    U_COMMON_API void addLcccChars(UnicodeSet& set) const;
    257    U_COMMON_API void addPropertyStarts(const USetAdder* sa, UErrorCode& errorCode) const;
    258    U_COMMON_API void addCanonIterPropertyStarts(const USetAdder* sa, UErrorCode& errorCode) const;
    259 
    260    // low-level properties ------------------------------------------------ ***
    261 
    262    U_COMMON_API UBool ensureCanonIterData(UErrorCode& errorCode) const;
    263 
    264    // The trie stores values for lead surrogate code *units*.
    265    // Surrogate code *points* are inert.
    266    U_COMMON_API uint16_t getNorm16(UChar32 c) const {
    267        return U_IS_LEAD(c) ?
    268            static_cast<uint16_t>(INERT) :
    269            UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
    270    }
    271    U_COMMON_API uint16_t getRawNorm16(UChar32 c) const {
    272        return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
    273    }
    274 
    275    U_COMMON_API UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
    276        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
    277            return UNORM_YES;
    278        } else if(minMaybeNo<=norm16) {
    279            return UNORM_MAYBE;
    280        } else {
    281            return UNORM_NO;
    282        }
    283    }
    284    U_COMMON_API UBool isAlgorithmicNoNo(uint16_t norm16) const {
    285        return limitNoNo <= norm16 && norm16 < minMaybeNo;
    286    }
    287    U_COMMON_API UBool isCompNo(uint16_t norm16) const {
    288        return minNoNo <= norm16 && norm16 < minMaybeNo;
    289    }
    290    U_COMMON_API UBool isDecompYes(uint16_t norm16) const {
    291        return norm16 < minYesNo || minMaybeYes <= norm16;
    292    }
    293 
    294    U_COMMON_API uint8_t getCC(uint16_t norm16) const {
    295        if(norm16>=MIN_NORMAL_MAYBE_YES) {
    296            return getCCFromNormalYesOrMaybe(norm16);
    297        }
    298        if(norm16<minNoNo || limitNoNo<=norm16) {
    299            return 0;
    300        }
    301        return getCCFromNoNo(norm16);
    302    }
    303    U_COMMON_API static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
    304        return static_cast<uint8_t>(norm16 >> OFFSET_SHIFT);
    305    }
    306    U_COMMON_API static uint8_t getCCFromYesOrMaybeYes(uint16_t norm16) {
    307        return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
    308    }
    309    U_COMMON_API uint8_t getCCFromYesOrMaybeYesCP(UChar32 c) const {
    310        if (c < minCompNoMaybeCP) { return 0; }
    311        return getCCFromYesOrMaybeYes(getNorm16(c));
    312    }
    313 
    314    /**
    315     * Returns the FCD data for code point c.
    316     * @param c A Unicode code point.
    317     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
    318     */
    319    U_COMMON_API uint16_t getFCD16(UChar32 c) const {
    320        if(c<minDecompNoCP) {
    321            return 0;
    322        } else if(c<=0xffff) {
    323            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
    324        }
    325        return getFCD16FromNormData(c);
    326    }
    327    /**
    328     * Returns the FCD data for the next code point (post-increment).
    329     * Might skip only a lead surrogate rather than the whole surrogate pair if none of
    330     * the supplementary code points associated with the lead surrogate have non-zero FCD data.
    331     * @param s A valid pointer into a string. Requires s!=limit.
    332     * @param limit The end of the string, or NULL.
    333     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
    334     */
    335    U_COMMON_API uint16_t nextFCD16(const char16_t*& s, const char16_t* limit) const {
    336        UChar32 c=*s++;
    337        if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
    338            return 0;
    339        }
    340        char16_t c2;
    341        if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
    342            c=U16_GET_SUPPLEMENTARY(c, c2);
    343            ++s;
    344        }
    345        return getFCD16FromNormData(c);
    346    }
    347    /**
    348     * Returns the FCD data for the previous code point (pre-decrement).
    349     * @param start The start of the string.
    350     * @param s A valid pointer into a string. Requires start<s.
    351     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
    352     */
    353    U_COMMON_API uint16_t previousFCD16(const char16_t* start, const char16_t*& s) const {
    354        UChar32 c=*--s;
    355        if(c<minDecompNoCP) {
    356            return 0;
    357        }
    358        if(!U16_IS_TRAIL(c)) {
    359            if(!singleLeadMightHaveNonZeroFCD16(c)) {
    360                return 0;
    361            }
    362        } else {
    363            char16_t c2;
    364            if(start<s && U16_IS_LEAD(c2=*(s-1))) {
    365                c=U16_GET_SUPPLEMENTARY(c2, c);
    366                --s;
    367            }
    368        }
    369        return getFCD16FromNormData(c);
    370    }
    371 
    372    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
    373    U_COMMON_API UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
    374        // 0<=lead<=0xffff
    375        uint8_t bits=smallFCD[lead>>8];
    376        if(bits==0) { return false; }
    377        return (bits >> ((lead >> 5) & 7)) & 1;
    378    }
    379    /** Returns the FCD value from the regular normalization data. */
    380    U_COMMON_API uint16_t getFCD16FromNormData(UChar32 c) const;
    381 
    382    U_COMMON_API uint16_t getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const;
    383 
    384    /**
    385     * Gets the decomposition for one code point.
    386     * @param c code point
    387     * @param buffer out-only buffer for algorithmic decompositions
    388     * @param length out-only, takes the length of the decomposition, if any
    389     * @return pointer to the decomposition, or NULL if none
    390     */
    391    U_COMMON_API const char16_t* getDecomposition(UChar32 c, char16_t buffer[4], int32_t& length) const;
    392 
    393    /**
    394     * Gets the raw decomposition for one code point.
    395     * @param c code point
    396     * @param buffer out-only buffer for algorithmic decompositions
    397     * @param length out-only, takes the length of the decomposition, if any
    398     * @return pointer to the decomposition, or NULL if none
    399     */
    400    U_COMMON_API const char16_t* getRawDecomposition(UChar32 c,
    401                                                     char16_t buffer[30],
    402                                                     int32_t& length) const;
    403 
    404    U_COMMON_API UChar32 composePair(UChar32 a, UChar32 b) const;
    405 
    406    U_COMMON_API UBool isCanonSegmentStarter(UChar32 c) const;
    407    U_COMMON_API UBool getCanonStartSet(UChar32 c, UnicodeSet& set) const;
    408 
    enum {
        // Fixed norm16 values, listed in ascending order.
        INERT=1,  // offset=0 hasCompBoundaryAfter=true
        JAMO_L=2,  // offset=1 hasCompBoundaryAfter=false
        MIN_NORMAL_MAYBE_YES=0xfc00,
        JAMO_VT=0xfe00,
        MIN_YES_YES_WITH_CC=0xfe02,

        // norm16 bit 0 is comp-boundary-after.
        HAS_COMP_BOUNDARY_AFTER=1,
        OFFSET_SHIFT=1,

        // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
        // tccc (0, 1, >1) for quick FCC boundary-after tests.
        DELTA_TCCC_0=0,
        DELTA_TCCC_1=2,
        DELTA_TCCC_GT_1=4,
        DELTA_TCCC_MASK=6,
        DELTA_SHIFT=3,

        MAX_DELTA=0x40
    };
    431 
    enum {
        // Byte offsets from the start of the data, after the generic header.
        IX_NORM_TRIE_OFFSET=0,
        IX_EXTRA_DATA_OFFSET=1,
        IX_SMALL_FCD_OFFSET=2,
        IX_RESERVED3_OFFSET=3,
        IX_RESERVED4_OFFSET=4,
        IX_RESERVED5_OFFSET=5,
        IX_RESERVED6_OFFSET=6,
        IX_TOTAL_SIZE=7,

        // Code point thresholds for quick check codes.
        IX_MIN_DECOMP_NO_CP=8,
        IX_MIN_COMP_NO_MAYBE_CP=9,

        // Norm16 value thresholds for quick check combinations and types of extra data.

        /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
        IX_MIN_YES_NO=10,
        /** Mappings are comp-normalized. */
        IX_MIN_NO_NO=11,
        IX_LIMIT_NO_NO=12,
        IX_MIN_MAYBE_YES=13,

        /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
        IX_MIN_YES_NO_MAPPINGS_ONLY=14,
        /** Mappings are not comp-normalized but have a comp boundary before. */
        IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15,
        /** Mappings do not have a comp boundary before. */
        IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16,
        /** Mappings to the empty string. */
        IX_MIN_NO_NO_EMPTY=17,

        IX_MIN_LCCC_CP=18,
        IX_RESERVED19=19,

        /** Two-way mappings; each starts with a character that combines backward. */
        IX_MIN_MAYBE_NO=20,
        /** Two-way mappings & compositions. */
        IX_MIN_MAYBE_NO_COMBINES_FWD=21,

        /** Number of index slots defined above. */
        IX_COUNT=22
    };
    475 
    // Flag bits and length mask applied to the first unit of a mapping.
    enum {
        MAPPING_LENGTH_MASK=0x1f,
        // unused bit 0x20,
        MAPPING_HAS_RAW_MAPPING=0x40,
        MAPPING_HAS_CCC_LCCC_WORD=0x80
    };
    482 
    // Constants for the first (COMP_1_*) and second (COMP_2_*) units
    // of a compositions-list entry.
    enum {
        COMP_1_TRIPLE=1,
        COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
        COMP_1_TRAIL_LIMIT=0x3400,
        COMP_1_TRAIL_MASK=0x7ffe,
        COMP_1_LAST_TUPLE=0x8000,

        COMP_2_TRAIL_SHIFT=6,
        COMP_2_TRAIL_MASK=0xffc0
    };
    492 
    493    // higher-level functionality ------------------------------------------ ***
    494 
    495    // NFD without an NFD Normalizer2 instance.
    496    U_COMMON_API UnicodeString& decompose(const UnicodeString& src,
    497                                          UnicodeString& dest,
    498                                          UErrorCode& errorCode) const;
    499    /**
    500     * Decomposes [src, limit[ and writes the result to dest.
    501     * limit can be NULL if src is NUL-terminated.
    502     * destLengthEstimate is the initial dest buffer capacity and can be -1.
    503     */
    504    U_COMMON_API void decompose(const char16_t* src,
    505                                const char16_t* limit,
    506                                UnicodeString& dest,
    507                                int32_t destLengthEstimate,
    508                                UErrorCode& errorCode) const;
    509 
    510    U_COMMON_API const char16_t* decompose(const char16_t* src,
    511                                           const char16_t* limit,
    512                                           ReorderingBuffer* buffer,
    513                                           UErrorCode& errorCode) const;
    514    U_COMMON_API void decomposeAndAppend(const char16_t* src,
    515                                         const char16_t* limit,
    516                                         UBool doDecompose,
    517                                         UnicodeString& safeMiddle,
    518                                         ReorderingBuffer& buffer,
    519                                         UErrorCode& errorCode) const;
    520 
    521    /** sink==nullptr: isNormalized()/spanQuickCheckYes() */
    522    U_COMMON_API const uint8_t* decomposeUTF8(uint32_t options,
    523                                              const uint8_t* src,
    524                                              const uint8_t* limit,
    525                                              ByteSink* sink,
    526                                              Edits* edits,
    527                                              UErrorCode& errorCode) const;
    528 
    529    U_COMMON_API UBool compose(const char16_t* src,
    530                               const char16_t* limit,
    531                               UBool onlyContiguous,
    532                               UBool doCompose,
    533                               ReorderingBuffer& buffer,
    534                               UErrorCode& errorCode) const;
    535    U_COMMON_API const char16_t* composeQuickCheck(const char16_t* src,
    536                                                   const char16_t* limit,
    537                                                   UBool onlyContiguous,
    538                                                   UNormalizationCheckResult* pQCResult) const;
    539    U_COMMON_API void composeAndAppend(const char16_t* src,
    540                                       const char16_t* limit,
    541                                       UBool doCompose,
    542                                       UBool onlyContiguous,
    543                                       UnicodeString& safeMiddle,
    544                                       ReorderingBuffer& buffer,
    545                                       UErrorCode& errorCode) const;
    546 
    547    /** sink==nullptr: isNormalized() */
    548    U_COMMON_API UBool composeUTF8(uint32_t options,
    549                                   UBool onlyContiguous,
    550                                   const uint8_t* src,
    551                                   const uint8_t* limit,
    552                                   ByteSink* sink,
    553                                   icu::Edits* edits,
    554                                   UErrorCode& errorCode) const;
    555 
    556    U_COMMON_API const char16_t* makeFCD(const char16_t* src,
    557                                         const char16_t* limit,
    558                                         ReorderingBuffer* buffer,
    559                                         UErrorCode& errorCode) const;
    560    U_COMMON_API void makeFCDAndAppend(const char16_t* src,
    561                                       const char16_t* limit,
    562                                       UBool doMakeFCD,
    563                                       UnicodeString& safeMiddle,
    564                                       ReorderingBuffer& buffer,
    565                                       UErrorCode& errorCode) const;
    566 
    567    U_COMMON_API UBool hasDecompBoundaryBefore(UChar32 c) const;
    568    U_COMMON_API UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
    569    U_COMMON_API UBool hasDecompBoundaryAfter(UChar32 c) const;
    570    U_COMMON_API UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
    571    U_COMMON_API UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
    572 
    573    U_COMMON_API UBool hasCompBoundaryBefore(UChar32 c) const {
    574        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
    575    }
    576    U_COMMON_API UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
    577        return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
    578    }
    579    U_COMMON_API UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
    580        uint16_t norm16=getNorm16(c);
    581        return isCompYesAndZeroCC(norm16) &&
    582            (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
    583            (!onlyContiguous || isInert(norm16) || *getDataForYesOrNo(norm16) <= 0x1ff);
    584            // The last check fetches the mapping's first unit and checks tccc<=1.
    585    }
    586 
    587    U_COMMON_API UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
    588    U_COMMON_API UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
    589    U_COMMON_API UBool isFCDInert(UChar32 c) const { return getFCD16(c) <= 1; }
    590 
    591  private:
    592    friend class InitCanonIterData;
    593    friend class LcccContext;
    594 
    595    UBool isMaybe(uint16_t norm16) const { return minMaybeNo<=norm16 && norm16<=JAMO_VT; }
    596    UBool isMaybeYesOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
    597    static UBool isInert(uint16_t norm16) { return norm16==INERT; }
    598    static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
    599    static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
    600    uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
    601    UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
    602    UBool isHangulLVT(uint16_t norm16) const {
    603        return norm16==hangulLVT();
    604    }
    605    UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
    606    // UBool isCompYes(uint16_t norm16) const {
    607    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
    608    // }
    609    // UBool isCompYesOrMaybe(uint16_t norm16) const {
    610    //     return norm16<minNoNo || minMaybeNo<=norm16;
    611    // }
    612    // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
    613    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    614    // }
    615    UBool isDecompYesAndZeroCC(uint16_t norm16) const {
    616        return norm16<minYesNo ||
    617               norm16==JAMO_VT ||
    618               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
    619    }
    620    /**
    621     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
    622     * the MaybeYes which combine-forward and have ccc=0.
    623     */
    624    UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
    625        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    626    }
    627    /** Since formatVersion 5: same as isAlgorithmicNoNo() */
    628    UBool isDecompNoAlgorithmic(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeNo; }
    629 
    630    // For use with isCompYes().
    631    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    632    // static uint8_t getCCFromYes(uint16_t norm16) {
    633    //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
    634    // }
    635    uint8_t getCCFromNoNo(uint16_t norm16) const {
    636        const uint16_t *mapping=getDataForYesOrNo(norm16);
    637        if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
    638            return static_cast<uint8_t>(*(mapping - 1));
    639        } else {
    640            return 0;
    641        }
    642    }
    643    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
    644    uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
    645        if(norm16<=minYesNo) {
    646            return 0;  // yesYes and Hangul LV have ccc=tccc=0
    647        } else {
    648            // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
    649            return static_cast<uint8_t>(*getDataForYesOrNo(norm16) >> 8); // tccc from yesNo
    650        }
    651    }
    652    uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const;
    653    uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
    654 
    655    // Requires algorithmic-NoNo.
    656    UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
    657        return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
    658    }
    659    UChar32 getAlgorithmicDelta(uint16_t norm16) const {
    660        return (norm16>>DELTA_SHIFT)-centerNoNoDelta;
    661    }
    662 
    663    const uint16_t *getDataForYesOrNo(uint16_t norm16) const {
    664        return extraData+(norm16>>OFFSET_SHIFT);
    665    }
    666    const uint16_t *getDataForMaybe(uint16_t norm16) const {
    667        return extraData+((norm16-minMaybeNo+limitNoNo)>>OFFSET_SHIFT);
    668    }
    669    const uint16_t *getData(uint16_t norm16) const {
    670        if(norm16>=minMaybeNo) {
    671            norm16=norm16-minMaybeNo+limitNoNo;
    672        }
    673        return extraData+(norm16>>OFFSET_SHIFT);
    674    }
    675    const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
    676        if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
    677            return nullptr;
    678        } else {
    679            // if yesYes: if Jamo L: harmless empty list
    680            return getData(norm16);
    681        }
    682    }
    683    const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
    684        // A composite stores its mapping first, with the compositions list right after it.
    685        const uint16_t *mapping=getData(norm16);
    686        const int32_t mappingLength=*mapping&MAPPING_LENGTH_MASK;
    687        // Skip the first unit (which holds the mapping length) and then the mapping itself.
    688        return mapping+1+mappingLength;
    689    }
    690    /**
    691     * @param norm16 norm16 value of a code point that must have compositions
    692     * @return compositions list pointer
    693     */
    694    const uint16_t *getCompositionsList(uint16_t norm16) const {
    695        // Decomp-yes values and composites locate their lists differently.
    696        if(isDecompYes(norm16)) { return getCompositionsListForDecompYes(norm16); }
    697        return getCompositionsListForComposite(norm16);
    698    }
    699 
    700    const char16_t *copyLowPrefixFromNulTerminated(const char16_t *src,
    701                                                UChar32 minNeedDataCP,
    702                                                ReorderingBuffer *buffer,
    703                                                UErrorCode &errorCode) const;
    704 
    705    enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };  // stopAt argument of the uint8_t decomposeShort()
    706 
    707    const char16_t *decomposeShort(const char16_t *src, const char16_t *limit,
    708                                UBool stopAtCompBoundary, UBool onlyContiguous,
    709                                ReorderingBuffer &buffer, UErrorCode &errorCode) const;
    710    UBool decompose(UChar32 c, uint16_t norm16,
    711                    ReorderingBuffer &buffer, UErrorCode &errorCode) const;
    712 
    713    const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
    714                                  StopAt stopAt, UBool onlyContiguous,
    715                                  ReorderingBuffer &buffer, UErrorCode &errorCode) const;
    716 
    717    static int32_t combine(const uint16_t *list, UChar32 trail);
    718    void addComposites(const uint16_t *list, UnicodeSet &set) const;
    719    void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
    720                   UBool onlyContiguous) const;
    721 
    722    UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
    723        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);  // below the threshold, norm16 is not consulted
    724    }
    725    UBool norm16HasCompBoundaryBefore(uint16_t norm16) const  {
    726        return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);  // decided from the norm16 value alone
    727    }
    728    UBool hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const;
    729    UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
    730    UBool hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
    731                               UBool onlyContiguous) const;
    732    UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
    733                               UBool onlyContiguous) const;
    734    UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
    735        if((norm16 & HAS_COMP_BOUNDARY_AFTER) == 0) { return false; }  // flag bit not set
    736        return !onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16);  // FCC also needs tccc<=1
    737    }
    738    /** FCC helper: given a norm16 with HAS_COMP_BOUNDARY_AFTER, is its tccc<=1? Inert values qualify at once. */
    739    UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
    740        if(isInert(norm16)) { return true; }
    741        return isDecompNoAlgorithmic(norm16) ? (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getDataForYesOrNo(norm16) <= 0x1ff;
    742    }
    743 
    744    const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p,
    745                                             UBool onlyContiguous) const;
    746    const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit,
    747                                         UBool onlyContiguous) const;
    748 
    749    const char16_t *findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const;
    750    const char16_t *findNextFCDBoundary(const char16_t *p, const char16_t *limit) const;
    751 
    752    void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
    753                                     CanonIterData &newData, UErrorCode &errorCode) const;
    754 
    755    int32_t getCanonValue(UChar32 c) const;
    756    const UnicodeSet &getCanonStartSet(int32_t n) const;
    757 
    758    // UVersionInfo dataVersion;
    759 
    760    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
    761    char16_t minDecompNoCP;  // from indexes[IX_MIN_DECOMP_NO_CP]: lowest code point with a decomposition mapping
    762    char16_t minCompNoMaybeCP;  // from indexes[IX_MIN_COMP_NO_MAYBE_CP]: lowest code point with NF*C_QC=No or Maybe
    763    char16_t minLcccCP;  // from indexes[IX_MIN_LCCC_CP]: lowest code point with lccc!=0
    764 
    765    // Norm16 value thresholds for quick check combinations and types of extra data.
    766    uint16_t minYesNo;
    767    uint16_t minYesNoMappingsOnly;
    768    uint16_t minNoNo;
    769    uint16_t minNoNoCompBoundaryBefore;
    770    uint16_t minNoNoCompNoMaybeCC;
    771    uint16_t minNoNoEmpty;
    772    uint16_t limitNoNo;
    773    uint16_t centerNoNoDelta;  // bias subtracted from norm16>>DELTA_SHIFT in getAlgorithmicDelta()
    774    uint16_t minMaybeNo;
    775    uint16_t minMaybeNoCombinesFwd;
    776    uint16_t minMaybeYes;
    777 
    778    const UCPTrie *normTrie;  // main normalization data: maps each code point to a 16-bit norm16 value
    779    const uint16_t *extraData;  // mappings and/or compositions
    780    const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
    781 
    782    UInitOnce       fCanonIterDataInitOnce {};
    783    CanonIterData  *fCanonIterData;
    784 };
    785 
    786 // Bits in canonIterData: flags in bits 31, 30 and 21; CANON_VALUE_MASK covers bits 20..0.
    787 #define CANON_NOT_SEGMENT_STARTER 0x80000000
    788 #define CANON_HAS_COMPOSITIONS 0x40000000
    789 #define CANON_HAS_SET 0x200000
    790 #define CANON_VALUE_MASK 0x1fffff
    791 
    792 /**
    793 * ICU-internal shortcut for quick access to standard Unicode normalization: static accessors only, never instantiated.
    794 */
    795 class U_COMMON_API Normalizer2Factory {
    796 public:
    797    static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
    798    static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
    799    static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
    800 
    801    static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
    802 
    803    static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);  // the *Impl getters return a Normalizer2Impl, not a Normalizer2
    804    static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
    805    static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
    806 
    807    // Get the Impl instance of the Normalizer2.
    808    // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
    809    static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
    810 private:
    811    Normalizer2Factory() = delete;  // No instantiation.
    812 };
    813 
    814 U_NAMESPACE_END
    815 
    816 U_CAPI int32_t U_EXPORT2
    817 unorm2_swap(const UDataSwapper *ds,
    818            const void *inData, int32_t length, void *outData,
    819            UErrorCode *pErrorCode);
    820 
    821 /**
    822 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
    823 * @internal
    824 */
    825 U_CFUNC UNormalizationCheckResult
    826 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
    827 
    828 /**
    829 * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
    830 * @internal
    831 */
    832 U_CFUNC uint16_t
    833 unorm_getFCD16(UChar32 c);
    834 
    835 /**
    836 * Format of Normalizer2 .nrm data files.
    837 * Format version 5.0.
    838 *
    839 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
    840 * ICU ships with data files for standard Unicode Normalization Forms
    841 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
    842 * NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
    843 * Custom (application-specific) data can be built into additional .nrm files
    844 * with the gennorm2 build tool.
    845 * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
    846 *
    847 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
    848 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
    849 *
    850 * A .nrm file begins with a standard ICU data file header
    851 * (DataHeader, see ucmndata.h and unicode/udata.h).
    852 * The UDataInfo.dataVersion field usually contains the Unicode version
    853 * for which the data was generated.
    854 *
    855 * After the header, the file contains the following parts.
    856 * Constants are defined as enum values of the Normalizer2Impl class.
    857 *
    858 * Many details of the data structures are described in the design doc
    859 * which is at https://unicode-org.github.io/icu/design/normalization/custom.html
    860 *
    861 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
    862 *
    863 *      The first eight indexes are byte offsets in ascending order.
    864 *      Each byte offset marks the start of the next part in the data file,
    865 *      and the end of the previous one.
    866 *      When two consecutive byte offsets are the same, then the corresponding part is empty.
    867 *      Byte offsets are offsets from after the header,
    868 *      that is, from the beginning of the indexes[].
    869 *      Each part starts at an offset with proper alignment for its data.
    870 *      If necessary, the previous part may include padding bytes to achieve this alignment.
    871 *
    872 *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
    873 *      with a decomposition mapping, that is, with NF*D_QC=No.
    874 *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
    875 *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
    876 *      minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
    877 *      is the lowest code point with lccc!=0.
    878 *
    879 *      The next eight indexes are thresholds of 16-bit trie values for ranges of
    880 *      values indicating multiple normalization properties.
    881 *      Format version 5 adds the two minMaybeNo* threshold indexes.
    882 *      The thresholds are listed here in threshold order,
    883 *      not in the order they are stored in the indexes.
    884 *          minYesNo=indexes[IX_MIN_YES_NO];
    885 *          minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
    886 *          minNoNo=indexes[IX_MIN_NO_NO];
    887 *          minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
    888 *          minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
    889 *          minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
    890 *          limitNoNo=indexes[IX_LIMIT_NO_NO];
    891 *          minMaybeNo=indexes[IX_MIN_MAYBE_NO];
    892 *          minMaybeNoCombinesFwd=indexes[IX_MIN_MAYBE_NO_COMBINES_FWD];
    893 *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];
    894 *      See the normTrie description below and the design doc for details.
    895 *
    896 * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie
    897 *
    898 *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
    899 *      Rather than using independent bits in the value (which would require more than 16 bits),
    900 *      information is extracted primarily via range checks.
    901 *      Except, format version 3+ uses bit 0 for hasCompBoundaryAfter().
    902 *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
    903 *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
    904 *      which means it has a two-way (round-trip) decomposition mapping.
    905 *      Values in the ranges 2<=norm16<limitNoNo and minMaybeNo<=norm16<minMaybeYes
    906 *      are also directly indexes into the extraData
    907 *      pointing to mappings, compositions lists, or both.
    908 *      Value norm16==INERT (0 in versions 1 & 2, 1 in version 3+)
    909 *      means that the character is normalization-inert, that is,
    910 *      it does not have a mapping, does not participate in composition, has a zero
    911 *      canonical combining class, and forms a boundary where text before it and after it
    912 *      can be normalized independently.
    913 *      For details about how multiple properties are encoded in 16-bit values
    914 *      see the design doc.
    915 *      Note that the encoding cannot express all combinations of the properties involved;
    916 *      it only supports those combinations that are allowed by
    917 *      the Unicode Normalization algorithms. Details are in the design doc as well.
    918 *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
    919 *
    920 *      The trie has a value for each lead surrogate code unit representing the "worst case"
    921 *      properties of the 1024 supplementary characters whose UTF-16 form starts with
    922 *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
    923 *      then their lead surrogate code unit has the trie value INERT.
    924 *      When the lead surrogate unit's value exceeds the quick check minimum during processing,
    925 *      the properties for the full supplementary code point need to be looked up.
    926 *
    927 * uint16_t extraData[];
    928 *
    929 *      The extraData array contains many per-character data sections.
    930 *      Each section contains mappings and/or composition lists.
    931 *      The norm16 value of each character that has such data is directly an index to
    932 *      a section of the extraData array.
    933 *
    934 *      In version 3+, the norm16 values must be shifted right by OFFSET_SHIFT
    935 *      for accessing extraData.
    936 *
    937 *      The data structures for compositions lists and mappings are described in the design doc.
    938 *
    939 *      In version 4 and below, the composition lists for MaybeYes characters were stored before
    940 *      the data for other characters.
    941 *      This sub-array had a length of MIN_NORMAL_MAYBE_YES-minMaybeYes.
    942 *      In version 3 & 4, the difference must be shifted right by OFFSET_SHIFT.
    943 *
    944 *      In version 5, the data for MaybeNo and MaybeYes characters is stored after
    945 *      the data for other characters.
    946 *
    947 *      If there are no MaybeNo and no MaybeYes characters,
    948 *      then minMaybeYes==minMaybeNo==MIN_NORMAL_MAYBE_YES.
    949 *      If there are such characters, then minMaybeNo is subtracted from their norm16 values
    950 *      to get the index into the extraData.
    951 *      In version 4 and below, the data index for Yes* and No* characters needs to be
    952 *      offset by the length of the MaybeYes data.
    953 *      In version 5, the data index for Maybe* characters needs to be offset by limitNoNo.
    954 *
    955 *      Version 5 is the first to support MaybeNo characters, and
    956 *      adds the minMaybeNo and minMaybeNoCombinesFwd thresholds and
    957 *      the corresponding sections of the extraData.
    958 *
    959 * uint8_t smallFCD[0x100]; -- new in format version 2
    960 *
    961 *      This is a bit set to help speed up FCD value lookups in the absence of a full
    962 *      UTrie2 or other large data structure with the full FCD value mapping.
    963 *
    964 *      Each smallFCD bit is set if any of the corresponding 32 BMP code points
    965 *      has a non-zero FCD value (lccc!=0 or tccc!=0).
    966 *      Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
    967 *      A bit for 32 lead surrogates is set if any of the 32k corresponding
    968 *      _supplementary_ code points has a non-zero FCD value.
    969 *
    970 *      This bit set is most useful for the large blocks of CJK characters with FCD=0.
    971 *
    972 * Changes from format version 1 to format version 2 ---------------------------
    973 *
    974 * - Addition of data for raw (not recursively decomposed) mappings.
    975 *   + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
    976 *     the mapping is to an empty string or when the character combines-forward.
    977 *     This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
    978 *     is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
    979 *   + For details see the design doc.
    980 * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
    981 *   distinct ranges (combines-forward vs. not)
    982 *   so that a range check can be used to find out if there is a compositions list.
    983 *   This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
    984 *   It is needed for the new (in ICU 49) composePair(), not for other normalization.
    985 * - Addition of the smallFCD[] bit set.
    986 *
    987 * Changes from format version 2 to format version 3 (ICU 60) ------------------
    988 *
    989 * - norm16 bit 0 indicates hasCompBoundaryAfter(),
    990 *   except that for contiguous composition (FCC) the tccc must be checked as well.
    991 *   Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
    992 *   Thresholds like minNoNo are tested before shifting.
    993 *
    994 * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
    995 *   to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
    996 *   See DELTA_TCCC_MASK etc.
    997 *   This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
    998 *   minMaybeNo is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
    999 *
   1000 * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
   1001 *   and ASCII characters are mapped algorithmically only to other ASCII characters.
   1002 *   This helps with hasCompBoundaryBefore() and compose() fast paths.
   1003 *   It is never necessary any more to loop for algorithmic mappings.
   1004 *
   1005 * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
   1006 *   indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
   1007 *   and separation of the noNo extraData into distinct ranges.
   1008 *   With this, the noNo norm16 value indicates whether the mapping is
   1009 *   compose-normalized, not normalized but hasCompBoundaryBefore(),
   1010 *   not even that, or maps to an empty string.
   1011 *   hasCompBoundaryBefore() can be determined solely from the norm16 value.
   1012 *
   1013 * - The norm16 value for Hangul LVT is now different from that for Hangul LV,
   1014 *   so that hasCompBoundaryAfter() need not check for the syllable type.
   1015 *   For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
   1016 *   For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
   1017 *   The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
   1018 *   to simplify some code.
   1019 *
   1020 * - The extraData firstUnit bit 5 is no longer necessary
   1021 *   (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
   1022 *   is reserved again, and always set to 0.
   1023 *
   1024 * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
   1025 *   This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
   1026 *   U+00AD Soft Hyphen maps to an empty string,
   1027 *   which is artificially assigned "worst case" values lccc=1 and tccc=255.
   1028 *
   1029 * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
   1030 *
   1031 * Changes from format version 3 to format version 4 (ICU 63) ------------------
   1032 *
   1033 * Switched from UTrie2 to UCPTrie/CodePointTrie.
   1034 *
   1035 * The new trie no longer stores different values for surrogate code *units* vs.
   1036 * surrogate code *points*.
   1037 * Lead surrogates still have values for optimized UTF-16 string processing.
   1038 * When looking up code point properties, the code now checks for lead surrogates and
   1039 * treats them as inert.
   1040 *
   1041 * gennorm2 now has to reject mappings for surrogate code points.
   1042 * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
   1043 * custom normalization data file.
   1044 *
   1045 * Changes from format version 4 to format version 5 (ICU 76) ------------------
   1046 *
   1047 * Unicode 16 adds the first MaybeYes characters which combine both backward and forward,
   1048 * taking this formerly theoretical data structure into reality.
   1049 *
   1050 * Unicode 16 also adds the first characters that have two-way mappings whose first characters
   1051 * combine backward. In order for normalization and the quick check to work properly,
   1052 * these composite characters also must be marked as NFC_QC=Maybe,
   1053 * corresponding to "combines back", although the composites themselves do not combine backward.
   1054 * Format version 5 adds two new ranges between "algorithmic NoNo" and MaybeYes,
   1055 * with thresholds minMaybeNo and minMaybeNoCombinesFwd,
   1056 * and indexes[IX_MIN_MAYBE_NO] and indexes[IX_MIN_MAYBE_NO_COMBINES_FWD],
   1057 * and corresponding mappings and composition lists in the extraData.
   1058 *
   1059 * Format version 5 moves the data for Maybe* characters from the start of the extraData array
   1060 * to its end.
   1061 */
   1062 
   1063 #endif  /* !UCONFIG_NO_NORMALIZATION */
   1064 #endif  /* __NORMALIZER2IMPL_H__ */