tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationdatabuilder.cpp (72338B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdatabuilder.cpp
      9 *
     10 * (replaced the former ucol_elm.cpp)
     11 *
     12 * created on: 2012apr01
     13 * created by: Markus W. Scherer
     14 */
     15 
     16 #include "unicode/utypes.h"
     17 
     18 #if !UCONFIG_NO_COLLATION
     19 
     20 #include "unicode/localpointer.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/ucharstrie.h"
     23 #include "unicode/ucharstriebuilder.h"
     24 #include "unicode/uniset.h"
     25 #include "unicode/unistr.h"
     26 #include "unicode/usetiter.h"
     27 #include "unicode/utf16.h"
     28 #include "cmemory.h"
     29 #include "collation.h"
     30 #include "collationdata.h"
     31 #include "collationdatabuilder.h"
     32 #include "collationfastlatinbuilder.h"
     33 #include "collationiterator.h"
     34 #include "normalizer2impl.h"
     35 #include "utrie2.h"
     36 #include "uvectr32.h"
     37 #include "uvectr64.h"
     38 #include "uvector.h"
     39 
     40 U_NAMESPACE_BEGIN
     41 
     42 CollationDataBuilder::CEModifier::~CEModifier() {}
     43 
     44 /**
     45 * Build-time context and CE32 for a code point.
     46 * If a code point has contextual mappings, then the default (no-context) mapping
     47 * and all conditional mappings are stored in a singly-linked list
     48 * of ConditionalCE32, sorted by context strings.
     49 *
     50 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
     51 * Context strings must be unique and in ascending order.
     52 */
     53 struct ConditionalCE32 : public UMemory {
     54    ConditionalCE32()
     55            : context(),
     56              ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
     57              next(-1) {}
     58    ConditionalCE32(const UnicodeString &ct, uint32_t ce)
     59            : context(ct),
     60              ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
     61              next(-1) {}
     62 
     63    inline UBool hasContext() const { return context.length() > 1; }
     64    inline int32_t prefixLength() const { return context.charAt(0); }
     65 
     66    /**
     67     * "\0" for the first entry for any code point, with its default CE32.
     68     *
     69     * Otherwise one unit with the length of the prefix string,
     70     * then the prefix string, then the contraction suffix.
     71     */
     72    UnicodeString context;
     73    /**
     74     * CE32 for the code point and its context.
     75     * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
     76     */
     77    uint32_t ce32;
     78    /**
     79     * Default CE32 for all contexts with this same prefix.
     80     * Initially NO_CE32. Set only while building runtime data structures,
     81     * and only on one of the nodes of a sub-list with the same prefix.
     82     */
     83    uint32_t defaultCE32;
     84    /**
     85     * CE32 for the built contexts.
     86     * When fetching CEs from the builder, the contexts are built into their runtime form
     87     * so that the normal collation implementation can process them.
     88     * The result is cached in the list head. It is reset when the contexts are modified.
     89     * All of these builtCE32 are invalidated by clearContexts(),
     90     * via incrementing the contextsEra.
     91     */
     92    uint32_t builtCE32;
     93    /**
     94     * The "era" of building intermediate contexts when the above builtCE32 was set.
     95     * When the array of cached, temporary contexts overflows, then clearContexts()
     96     * removes them all and invalidates the builtCE32 that used to point to built tries.
     97     */
     98    int32_t era = -1;
     99    /**
    100     * Index of the next ConditionalCE32.
    101     * Negative for the end of the list.
    102     */
    103    int32_t next;
    104    // Note: We could create a separate class for all of the contextual mappings for
    105    // a code point, with the builtCE32, the era, and a list of the actual mappings.
    106    // The class that represents one mapping would then not need to
    107    // store those fields in each element.
    108 };
    109 
    110 U_CDECL_BEGIN
    111 
    112 void U_CALLCONV
    113 uprv_deleteConditionalCE32(void *obj) {
    114    delete static_cast<ConditionalCE32 *>(obj);
    115 }
    116 
    117 U_CDECL_END
    118 
    119 /**
    120 * Build-time collation element and character iterator.
    121 * Uses the runtime CollationIterator for fetching CEs for a string
    122 * but reads from the builder's unfinished data structures.
    123 * In particular, this class reads from the unfinished trie
    124 * and has to avoid CollationIterator::nextCE() and redirect other
    125 * calls to data->getCE32() and data->getCE32FromSupplementary().
    126 *
    127 * We do this so that we need not implement the collation algorithm
    128 * again for the builder and make it behave exactly like the runtime code.
    129 * That would be more difficult to test and maintain than this indirection.
    130 *
    131 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
    132 * so the data accesses from those code paths need not be modified.
    133 *
    134 * This class iterates directly over whole code points
    135 * so that the CollationIterator does not need the finished trie
    136 * for handling the LEAD_SURROGATE_TAG.
    137 */
    138 class DataBuilderCollationIterator : public CollationIterator {
    139 public:
    140    DataBuilderCollationIterator(CollationDataBuilder &b);
    141 
    142    virtual ~DataBuilderCollationIterator();
    143 
    144    int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);
    145 
    146    virtual void resetToOffset(int32_t newOffset) override;
    147    virtual int32_t getOffset() const override;
    148 
    149    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
    150    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
    151 
    152 protected:
    153    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
    154    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
    155 
    156    virtual uint32_t getDataCE32(UChar32 c) const override;
    157    virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) override;
    158 
    159    CollationDataBuilder &builder;
    160    CollationData builderData;
    161    uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
    162    const UnicodeString *s;
    163    int32_t pos;
    164 };
    165 
    166 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b)
    167        : CollationIterator(&builderData, /*numeric=*/ false),
    168          builder(b), builderData(b.nfcImpl),
    169          s(nullptr), pos(0) {
    170    builderData.base = builder.base;
    171    // Set all of the jamoCE32s[] to indirection CE32s.
    172    for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
    173        UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j);
    174        jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) |
    175                CollationDataBuilder::IS_BUILDER_JAMO_CE32;
    176    }
    177    builderData.jamoCE32s = jamoCE32s;
    178 }
    179 
    180 DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
    181 
    182 int32_t
    183 DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start,
    184                                       int64_t ces[], int32_t cesLength) {
    185    // Set the pointers each time, in case they changed due to reallocation.
    186    builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer());
    187    builderData.ces = builder.ce64s.getBuffer();
    188    builderData.contexts = builder.contexts.getBuffer();
    189    // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
    190    reset();
    191    s = &str;
    192    pos = start;
    193    UErrorCode errorCode = U_ZERO_ERROR;
    194    while(U_SUCCESS(errorCode) && pos < s->length()) {
    195        // No need to keep all CEs in the iterator buffer.
    196        clearCEs();
    197        UChar32 c = s->char32At(pos);
    198        pos += U16_LENGTH(c);
    199        uint32_t ce32 = utrie2_get32(builder.trie, c);
    200        const CollationData *d;
    201        if(ce32 == Collation::FALLBACK_CE32) {
    202            d = builder.base;
    203            ce32 = builder.base->getCE32(c);
    204        } else {
    205            d = &builderData;
    206        }
    207        appendCEsFromCE32(d, c, ce32, /*forward=*/ true, errorCode);
    208        U_ASSERT(U_SUCCESS(errorCode));
    209        for(int32_t i = 0; i < getCEsLength(); ++i) {
    210            int64_t ce = getCE(i);
    211            if(ce != 0) {
    212                if(cesLength < Collation::MAX_EXPANSION_LENGTH) {
    213                    ces[cesLength] = ce;
    214                }
    215                ++cesLength;
    216            }
    217        }
    218    }
    219    return cesLength;
    220 }
    221 
    222 void
    223 DataBuilderCollationIterator::resetToOffset(int32_t newOffset) {
    224    reset();
    225    pos = newOffset;
    226 }
    227 
    228 int32_t
    229 DataBuilderCollationIterator::getOffset() const {
    230    return pos;
    231 }
    232 
    233 UChar32
    234 DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    235    if(pos == s->length()) {
    236        return U_SENTINEL;
    237    }
    238    UChar32 c = s->char32At(pos);
    239    pos += U16_LENGTH(c);
    240    return c;
    241 }
    242 
    243 UChar32
    244 DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    245    if(pos == 0) {
    246        return U_SENTINEL;
    247    }
    248    UChar32 c = s->char32At(pos - 1);
    249    pos -= U16_LENGTH(c);
    250    return c;
    251 }
    252 
    253 void
    254 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    255    pos = s->moveIndex32(pos, num);
    256 }
    257 
    258 void
    259 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    260    pos = s->moveIndex32(pos, -num);
    261 }
    262 
    263 uint32_t
    264 DataBuilderCollationIterator::getDataCE32(UChar32 c) const {
    265    return utrie2_get32(builder.trie, c);
    266 }
    267 
    268 uint32_t
    269 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) {
    270    if (U_FAILURE(errorCode)) { return 0; }
    271    U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG));
    272    if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) {
    273        UChar32 jamo = Collation::indexFromCE32(ce32);
    274        return utrie2_get32(builder.trie, jamo);
    275    } else {
    276        ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32);
    277        if (cond == nullptr) {
    278            errorCode = U_INTERNAL_PROGRAM_ERROR;
    279            // TODO: ICU-21531 figure out why this happens.
    280            return 0;
    281        }
    282        if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) {
    283            // Build the context-sensitive mappings into their runtime form and cache the result.
    284            cond->builtCE32 = builder.buildContext(cond, errorCode);
    285            if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    286                errorCode = U_ZERO_ERROR;
    287                builder.clearContexts();
    288                cond->builtCE32 = builder.buildContext(cond, errorCode);
    289            }
    290            cond->era = builder.contextsEra;
    291            builderData.contexts = builder.contexts.getBuffer();
    292        }
    293        return cond->builtCE32;
    294    }
    295 }
    296 
    297 // ------------------------------------------------------------------------- ***
    298 
    299 CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
    300        : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
    301          base(nullptr), baseSettings(nullptr),
    302          trie(nullptr),
    303          ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
    304          modified(false),
    305          icu4xMode(icu4xMode),
    306          fastLatinEnabled(false), fastLatinBuilder(nullptr),
    307          collIter(nullptr) {
    308    // Reserve the first CE32 for U+0000.
    309    if (!icu4xMode) {
    310        ce32s.addElement(0, errorCode);
    311    }
    312    conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
    313 }
    314 
    315 CollationDataBuilder::~CollationDataBuilder() {
    316    utrie2_close(trie);
    317    delete fastLatinBuilder;
    318    delete collIter;
    319 }
    320 
    321 void
    322 CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) {
    323    if(U_FAILURE(errorCode)) { return; }
    324    if(trie != nullptr) {
    325        errorCode = U_INVALID_STATE_ERROR;
    326        return;
    327    }
    328    if(b == nullptr) {
    329        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    330        return;
    331    }
    332    base = b;
    333 
    334    // For a tailoring, the default is to fall back to the base.
    335    // For ICU4X, use the same value for fallback as for the default
    336    // to avoid having to have different blocks for the two.
    337    trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
    338 
    339    if (!icu4xMode) {
    340        // Set the Latin-1 letters block so that it is allocated first in the data array,
    341        // to try to improve locality of reference when sorting Latin-1 text.
    342        // Do not use utrie2_setRange32() since that will not actually allocate blocks
    343        // that are filled with the default value.
    344        // ASCII (0..7F) is already preallocated anyway.
    345        for(UChar32 c = 0xc0; c <= 0xff; ++c) {
    346            utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
    347        }
    348 
    349        // Hangul syllables are not tailorable (except via tailoring Jamos).
    350        // Always set the Hangul tag to help performance.
    351        // Do this here, rather than in buildMappings(),
    352        // so that we see the HANGUL_TAG in various assertions.
    353        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
    354        utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
    355 
    356        // Copy the set contents but don't copy/clone the set as a whole because
    357        // that would copy the isFrozen state too.
    358        unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
    359    }
    360 
    361    if(U_FAILURE(errorCode)) { return; }
    362 }
    363 
    364 UBool
    365 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
    366                                           uint32_t primary, int32_t step,
    367                                           UErrorCode &errorCode) {
    368    if(U_FAILURE(errorCode)) { return false; }
    369    U_ASSERT(start <= end);
    370    // TODO: Do we need to check what values are currently set for start..end?
    371    // An offset range is worth it only if we can achieve an overlap between
    372    // adjacent UTrie2 blocks of 32 code points each.
    373    // An offset CE is also a little more expensive to look up and compute
    374    // than a simple CE.
    375    // If the range spans at least three UTrie2 block boundaries (> 64 code points),
    376    // then we take it.
    377    // If the range spans one or two block boundaries and there are
    378    // at least 4 code points on either side, then we take it.
    379    // (We could additionally require a minimum range length of, say, 16.)
    380    int32_t blockDelta = (end >> 5) - (start >> 5);
    381    if(2 <= step && step <= 0x7f &&
    382            (blockDelta >= 3 ||
    383            (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) {
    384        int64_t dataCE = (static_cast<int64_t>(primary) << 32) | (start << 8) | step;
    385        if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
    386        int32_t index = addCE(dataCE, errorCode);
    387        if(U_FAILURE(errorCode)) { return 0; }
    388        if(index > Collation::MAX_INDEX) {
    389            errorCode = U_BUFFER_OVERFLOW_ERROR;
    390            return 0;
    391        }
    392        uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
    393        utrie2_setRange32(trie, start, end, offsetCE32, true, &errorCode);
    394        modified = true;
    395        return true;
    396    } else {
    397        return false;
    398    }
    399 }
    400 
    401 uint32_t
    402 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
    403                                                   uint32_t primary, int32_t step,
    404                                                   UErrorCode &errorCode) {
    405    if(U_FAILURE(errorCode)) { return 0; }
    406    UBool isCompressible = isCompressiblePrimary(primary);
    407    if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) {
    408        return Collation::incThreeBytePrimaryByOffset(primary, isCompressible,
    409                                                      (end - start + 1) * step);
    410    } else {
    411        // Short range: Set individual CE32s.
    412        for(;;) {
    413            utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode);
    414            ++start;
    415            primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step);
    416            if(start > end) { return primary; }
    417        }
    418        modified = true;
    419    }
    420 }
    421 
    422 uint32_t
    423 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const {
    424    int32_t i = Collation::indexFromCE32(ce32);
    425    int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i);
    426    uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
    427    return Collation::makeLongPrimaryCE32(p);
    428 }
    429 
    430 UBool
    431 CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const {
    432    return base->isCompressibleLeadByte(b);
    433 }
    434 
    435 UBool
    436 CollationDataBuilder::isAssigned(UChar32 c) const {
    437    return Collation::isAssignedCE32(utrie2_get32(trie, c));
    438 }
    439 
    440 uint32_t
    441 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const {
    442    uint32_t ce32 = utrie2_get32(trie, c);
    443    if(Collation::isLongPrimaryCE32(ce32)) {
    444        return Collation::primaryFromLongPrimaryCE32(ce32);
    445    } else {
    446        return 0;
    447    }
    448 }
    449 
    450 int64_t
    451 CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
    452    if(U_FAILURE(errorCode)) { return 0; }
    453    // Keep parallel with CollationData::getSingleCE().
    454    UBool fromBase = false;
    455    uint32_t ce32 = utrie2_get32(trie, c);
    456    if(ce32 == Collation::FALLBACK_CE32) {
    457        fromBase = true;
    458        ce32 = base->getCE32(c);
    459    }
    460    while(Collation::isSpecialCE32(ce32)) {
    461        switch(Collation::tagFromCE32(ce32)) {
    462        case Collation::LATIN_EXPANSION_TAG:
    463        case Collation::BUILDER_DATA_TAG:
    464        case Collation::PREFIX_TAG:
    465        case Collation::CONTRACTION_TAG:
    466        case Collation::HANGUL_TAG:
    467        case Collation::LEAD_SURROGATE_TAG:
    468            errorCode = U_UNSUPPORTED_ERROR;
    469            return 0;
    470        case Collation::FALLBACK_TAG:
    471        case Collation::RESERVED_TAG_3:
    472            errorCode = U_INTERNAL_PROGRAM_ERROR;
    473            return 0;
    474        case Collation::LONG_PRIMARY_TAG:
    475            return Collation::ceFromLongPrimaryCE32(ce32);
    476        case Collation::LONG_SECONDARY_TAG:
    477            return Collation::ceFromLongSecondaryCE32(ce32);
    478        case Collation::EXPANSION32_TAG:
    479            if(Collation::lengthFromCE32(ce32) == 1) {
    480                int32_t i = Collation::indexFromCE32(ce32);
    481                ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
    482                break;
    483            } else {
    484                errorCode = U_UNSUPPORTED_ERROR;
    485                return 0;
    486            }
    487        case Collation::EXPANSION_TAG: {
    488            if(Collation::lengthFromCE32(ce32) == 1) {
    489                int32_t i = Collation::indexFromCE32(ce32);
    490                return fromBase ? base->ces[i] : ce64s.elementAti(i);
    491            } else {
    492                errorCode = U_UNSUPPORTED_ERROR;
    493                return 0;
    494            }
    495        }
    496        case Collation::DIGIT_TAG:
    497            // Fetch the non-numeric-collation CE32 and continue.
    498            ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32));
    499            break;
    500        case Collation::U0000_TAG:
    501            U_ASSERT(c == 0);
    502            // Fetch the normal ce32 for U+0000 and continue.
    503            ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0);
    504            break;
    505        case Collation::OFFSET_TAG:
    506            ce32 = getCE32FromOffsetCE32(fromBase, c, ce32);
    507            break;
    508        case Collation::IMPLICIT_TAG:
    509            return Collation::unassignedCEFromCodePoint(c);
    510        }
    511    }
    512    return Collation::ceFromSimpleCE32(ce32);
    513 }
    514 
    515 int32_t
    516 CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) {
    517    int32_t length = ce64s.size();
    518    for(int32_t i = 0; i < length; ++i) {
    519        if(ce == ce64s.elementAti(i)) { return i; }
    520    }
    521    ce64s.addElement(ce, errorCode);
    522    return length;
    523 }
    524 
    525 int32_t
    526 CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) {
    527    int32_t length = ce32s.size();
    528    for(int32_t i = 0; i < length; ++i) {
    529        if (ce32 == static_cast<uint32_t>(ce32s.elementAti(i))) { return i; }
    530    }
    531    ce32s.addElement(static_cast<int32_t>(ce32), errorCode);
    532    return length;
    533 }
    534 
    535 int32_t
    536 CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32,
    537                                         UErrorCode &errorCode) {
    538    if(U_FAILURE(errorCode)) { return -1; }
    539    U_ASSERT(!context.isEmpty());
    540    int32_t index = conditionalCE32s.size();
    541    if(index > Collation::MAX_INDEX) {
    542        errorCode = U_BUFFER_OVERFLOW_ERROR;
    543        return -1;
    544    }
    545    LocalPointer<ConditionalCE32> cond(new ConditionalCE32(context, ce32), errorCode);
    546    conditionalCE32s.adoptElement(cond.orphan(), errorCode);
    547    if(U_FAILURE(errorCode)) {
    548        return -1;
    549    }
    550    return index;
    551 }
    552 
    553 void
    554 CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s,
    555                          const int64_t ces[], int32_t cesLength,
    556                          UErrorCode &errorCode) {
    557    uint32_t ce32 = encodeCEs(ces, cesLength, errorCode);
    558    addCE32(prefix, s, ce32, errorCode);
    559 }
    560 
    561 void
    562 CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
    563                              uint32_t ce32, UErrorCode &errorCode) {
    564    if(U_FAILURE(errorCode)) { return; }
    565    if(s.isEmpty()) {
    566        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    567        return;
    568    }
    569    if(trie == nullptr || utrie2_isFrozen(trie)) {
    570        errorCode = U_INVALID_STATE_ERROR;
    571        return;
    572    }
    573    UChar32 c = s.char32At(0);
    574    int32_t cLength = U16_LENGTH(c);
    575    uint32_t oldCE32 = utrie2_get32(trie, c);
    576    UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
    577 
    578    if (icu4xMode) {
    579        if (base && c >= 0x1100 && c < 0x1200) {
    580            // Omit jamo tailorings.
    581            // TODO(https://github.com/unicode-org/icu4x/issues/1941).
    582        }
    583        const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
    584        UnicodeString sInNfd;
    585        nfdNormalizer->normalize(s, sInNfd, errorCode);
    586        if (s != sInNfd) {
    587            // s is not in NFD, so it cannot match in ICU4X, since ICU4X only
    588            // does NFD lookups.
    589 
    590            // As of Unicode 16 alpha, the cases that come here are:
    591            //
    592            // 1. The second character is a special decomposing Tibetan vowel
    593            //    sign. These are OK to ignore in the precomposed form, since
    594            //    the decomposed form is added also.
    595            // 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI
    596            //    and other such cases.
    597            //    For details see the normalization section of
    598            //    https://www.unicode.org/review/pri497/pri497-background.html
    599            // 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical
    600            //    index feature of ICU4C, which at this time does not have
    601            //    a counterpart in ICU4X.
    602            return;
    603        }
    604 
    605        if (!prefix.isEmpty()) {
    606            UnicodeString prefixInNfd;
    607            nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
    608            if (prefix != prefixInNfd) {
    609                errorCode = U_UNSUPPORTED_ERROR;
    610                return;
    611            }
    612 
    613            int32_t count = prefix.countChar32();
    614            if (count > 2) {
    615                // Prefix too long for ICU4X.
    616                errorCode = U_UNSUPPORTED_ERROR;
    617                return;
    618            }
    619            UChar32 utf32[4];
    620            int32_t len = prefix.toUTF32(utf32, 4, errorCode);
    621            if (len != count) {
    622                errorCode = U_INVALID_STATE_ERROR;
    623                return;
    624            }
    625            UChar32 c = utf32[0];
    626            if (u_getCombiningClass(c)) {
    627                // Prefix must start with as starter for ICU4X.
    628                errorCode = U_UNSUPPORTED_ERROR;
    629                return;
    630            }
    631            // XXX: Korean searchjl has jamo in prefix, so commenting out this
    632            // check for now. ICU4X currently ignores non-root jamo tables anyway.
    633            // searchjl was added in
    634            // https://unicode-org.atlassian.net/browse/CLDR-3560
    635            // Contractions were changed to prefixes in
    636            // https://unicode-org.atlassian.net/browse/CLDR-6546
    637            //
    638            // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
    639            //     errorCode = U_UNSUPPORTED_ERROR;
    640            //     return;
    641            // }
    642            if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
    643                // Second character in prefix, if present, must be a kana voicing mark for ICU4X.
    644                errorCode = U_UNSUPPORTED_ERROR;
    645                return;
    646            }
    647        }
    648 
    649        if (s.length() > cLength) {
    650            // Check that there's no modern Hangul in contractions.
    651            for (int32_t i = 0; i < s.length(); ++i) {
    652                char16_t c = s.charAt(i);
    653                if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
    654                    errorCode = U_UNSUPPORTED_ERROR;
    655                    return;
    656                }
    657            }
    658            int32_t sCount = s.countChar32();
    659            UChar32 sUtf32[32];
    660            int32_t sLen = s.toUTF32(sUtf32, 32, errorCode);
    661            if (sLen != sCount) {
    662                // If this error is ever reached, just increase the buffer
    663                // size above.
    664                errorCode = U_UNSUPPORTED_ERROR;
    665                return;
    666            }
    667            for (int32_t i = 1; i < sLen - 1; ++i) {
    668                if (u_getCombiningClass(sUtf32[i]) == 0) {
    669                    contractionMiddleStarter.add(sUtf32[i]);
    670                }
    671            }
    672        }
    673    }
    674 
    675    if(oldCE32 == Collation::FALLBACK_CE32) {
    676        // First tailoring for c.
    677        // If c has contextual base mappings or if we add a contextual mapping,
    678        // then copy the base mappings.
    679        // Otherwise we just override the base mapping.
    680        uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
    681        if(hasContext || Collation::ce32HasContext(baseCE32)) {
    682            oldCE32 = copyFromBaseCE32(c, baseCE32, true, errorCode);
    683            utrie2_set32(trie, c, oldCE32, &errorCode);
    684            if(U_FAILURE(errorCode)) { return; }
    685        }
    686    }
    687    if(!hasContext) {
    688        // No prefix, no contraction.
    689        if(!isBuilderContextCE32(oldCE32)) {
    690            utrie2_set32(trie, c, ce32, &errorCode);
    691        } else {
    692            ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
    693            cond->builtCE32 = Collation::NO_CE32;
    694            cond->ce32 = ce32;
    695        }
    696    } else {
    697        ConditionalCE32 *cond;
    698        if(!isBuilderContextCE32(oldCE32)) {
    699            // Replace the simple oldCE32 with a builder context CE32
    700            // pointing to a new ConditionalCE32 list head.
    701            int32_t index = addConditionalCE32(UnicodeString(static_cast<char16_t>(0)), oldCE32, errorCode);
    702            if(U_FAILURE(errorCode)) { return; }
    703            uint32_t contextCE32 = makeBuilderContextCE32(index);
    704            utrie2_set32(trie, c, contextCE32, &errorCode);
    705            contextChars.add(c);
    706            cond = getConditionalCE32(index);
    707        } else {
    708            cond = getConditionalCE32ForCE32(oldCE32);
    709            cond->builtCE32 = Collation::NO_CE32;
    710        }
    711        UnicodeString suffix(s, cLength);
    712        UnicodeString context(static_cast<char16_t>(prefix.length()));
    713        context.append(prefix).append(suffix);
    714        if (icu4xMode && !suffix.isEmpty() && !prefix.isEmpty()) {
    715            // ICU4X does not support the combination of prefix and contraction.
    716            // This combination is supported by LDML but does not occur in the
    717            // root or any tailorings in CLDR as of February 2025.
    718            // If support for this case becomes necessary, a practical change
    719            // would be allocating a flag on prefix ce32 and setting the
    720            // flag on a prefix ce32 if any ce32 that can be found under
    721            // the prefix ce32 (either the default or any UCharsTrie value) is
    722            // a contraction ce32 or if the prefix ce32 is the utrie2 value
    723            // for a character that is a starter that occurs in a middle
    724            // (neither first nor last) position in a contraction.
    725            errorCode = U_UNSUPPORTED_ERROR;
    726            return;
    727        }
    728        unsafeBackwardSet.addAll(suffix);
    729        for(;;) {
    730            // invariant: context > cond->context
    731            int32_t next = cond->next;
    732            if(next < 0) {
    733                // Append a new ConditionalCE32 after cond.
    734                int32_t index = addConditionalCE32(context, ce32, errorCode);
    735                if(U_FAILURE(errorCode)) { return; }
    736                cond->next = index;
    737                break;
    738            }
    739            ConditionalCE32 *nextCond = getConditionalCE32(next);
    740            int8_t cmp = context.compare(nextCond->context);
    741            if(cmp < 0) {
    742                // Insert a new ConditionalCE32 between cond and nextCond.
    743                int32_t index = addConditionalCE32(context, ce32, errorCode);
    744                if(U_FAILURE(errorCode)) { return; }
    745                cond->next = index;
    746                getConditionalCE32(index)->next = next;
    747                break;
    748            } else if(cmp == 0) {
    749                // Same context as before, overwrite its ce32.
    750                nextCond->ce32 = ce32;
    751                break;
    752            }
    753            cond = nextCond;
    754        }
    755    }
    756    modified = true;
    757 }
    758 
    759 uint32_t
    760 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
    761    uint32_t p = static_cast<uint32_t>(ce >> 32);
    762    uint32_t lower32 = static_cast<uint32_t>(ce);
    763    uint32_t t = static_cast<uint32_t>(ce & 0xffff);
    764    U_ASSERT((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
    765    if((ce & INT64_C(0xffff00ff00ff)) == 0) {
    766        // normal form ppppsstt
    767        return p | (lower32 >> 16) | (t >> 8);
    768    } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) {
    769        // long-primary form ppppppC1
    770        return Collation::makeLongPrimaryCE32(p);
    771    } else if(p == 0 && (t & 0xff) == 0) {
    772        // long-secondary form ssssttC2
    773        return Collation::makeLongSecondaryCE32(lower32);
    774    }
    775    return Collation::NO_CE32;
    776 }
    777 
    778 uint32_t
    779 CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
    780    // Try to encode one CE as one CE32.
    781    uint32_t ce32 = encodeOneCEAsCE32(ce);
    782    if(ce32 != Collation::NO_CE32) { return ce32; }
    783    int32_t index = addCE(ce, errorCode);
    784    if(U_FAILURE(errorCode)) { return 0; }
    785    if(index > Collation::MAX_INDEX) {
    786        errorCode = U_BUFFER_OVERFLOW_ERROR;
    787        return 0;
    788    }
    789    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1);
    790 }
    791 
    792 uint32_t
    793 CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
    794                                UErrorCode &errorCode) {
    795    if(U_FAILURE(errorCode)) { return 0; }
    796    if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
    797        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    798        return 0;
    799    }
    800    if(trie == nullptr || utrie2_isFrozen(trie)) {
    801        errorCode = U_INVALID_STATE_ERROR;
    802        return 0;
    803    }
    804    if(cesLength == 0) {
    805        // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
    806        // Do this here so that callers need not do it.
    807        return encodeOneCEAsCE32(0);
    808    } else if(cesLength == 1) {
    809        return encodeOneCE(ces[0], errorCode);
    810    } else if(cesLength == 2 && !icu4xMode) {
    811        // Try to encode two CEs as one CE32.
    812        // Turn this off for ICU4X, because without the canonical closure
    813        // these are so rare that it doesn't make sense to spend a branch
    814        // on checking this tag when using the data.
    815        int64_t ce0 = ces[0];
    816        int64_t ce1 = ces[1];
    817        uint32_t p0 = static_cast<uint32_t>(ce0 >> 32);
    818        if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE &&
    819                (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE &&
    820                p0 != 0) {
    821            // Latin mini expansion
    822            return
    823                p0 |
    824                ((static_cast<uint32_t>(ce0) & 0xff00u) << 8) |
    825                static_cast<uint32_t>(ce1 >> 16) |
    826                Collation::SPECIAL_CE32_LOW_BYTE |
    827                Collation::LATIN_EXPANSION_TAG;
    828        }
    829    }
    830    // Try to encode two or more CEs as CE32s.
    831    int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
    832    for(int32_t i = 0;; ++i) {
    833        if(i == cesLength) {
    834            return encodeExpansion32(newCE32s, cesLength, errorCode);
    835        }
    836        uint32_t ce32 = encodeOneCEAsCE32(ces[i]);
    837        if(ce32 == Collation::NO_CE32) { break; }
    838        newCE32s[i] = static_cast<int32_t>(ce32);
    839    }
    840    return encodeExpansion(ces, cesLength, errorCode);
    841 }
    842 
    843 uint32_t
    844 CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
    845    if(U_FAILURE(errorCode)) { return 0; }
    846    // See if this sequence of CEs has already been stored.
    847    int64_t first = ces[0];
    848    int32_t ce64sMax = ce64s.size() - length;
    849    for(int32_t i = 0; i <= ce64sMax; ++i) {
    850        if(first == ce64s.elementAti(i)) {
    851            if(i > Collation::MAX_INDEX) {
    852                errorCode = U_BUFFER_OVERFLOW_ERROR;
    853                return 0;
    854            }
    855            for(int32_t j = 1;; ++j) {
    856                if(j == length) {
    857                    return Collation::makeCE32FromTagIndexAndLength(
    858                            Collation::EXPANSION_TAG, i, length);
    859                }
    860                if(ce64s.elementAti(i + j) != ces[j]) { break; }
    861            }
    862        }
    863    }
    864    // Store the new sequence.
    865    int32_t i = ce64s.size();
    866    if(i > Collation::MAX_INDEX) {
    867        errorCode = U_BUFFER_OVERFLOW_ERROR;
    868        return 0;
    869    }
    870    for(int32_t j = 0; j < length; ++j) {
    871        ce64s.addElement(ces[j], errorCode);
    872    }
    873    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length);
    874 }
    875 
    876 uint32_t
    877 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
    878                                        UErrorCode &errorCode) {
    879    if(U_FAILURE(errorCode)) { return 0; }
    880    // See if this sequence of CE32s has already been stored.
    881    int32_t first = newCE32s[0];
    882    int32_t ce32sMax = ce32s.size() - length;
    883    for(int32_t i = 0; i <= ce32sMax; ++i) {
    884        if(first == ce32s.elementAti(i)) {
    885            if(i > Collation::MAX_INDEX) {
    886                errorCode = U_BUFFER_OVERFLOW_ERROR;
    887                return 0;
    888            }
    889            for(int32_t j = 1;; ++j) {
    890                if(j == length) {
    891                    return Collation::makeCE32FromTagIndexAndLength(
    892                            Collation::EXPANSION32_TAG, i, length);
    893                }
    894                if(ce32s.elementAti(i + j) != newCE32s[j]) { break; }
    895            }
    896        }
    897    }
    898    // Store the new sequence.
    899    int32_t i = ce32s.size();
    900    if(i > Collation::MAX_INDEX) {
    901        errorCode = U_BUFFER_OVERFLOW_ERROR;
    902        return 0;
    903    }
    904    for(int32_t j = 0; j < length; ++j) {
    905        ce32s.addElement(newCE32s[j], errorCode);
    906    }
    907    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length);
    908 }
    909 
    910 uint32_t
    911 CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
    912                                       UErrorCode &errorCode) {
    913    if(U_FAILURE(errorCode)) { return 0; }
    914    if(!Collation::isSpecialCE32(ce32)) { return ce32; }
    915    switch(Collation::tagFromCE32(ce32)) {
    916    case Collation::LONG_PRIMARY_TAG:
    917    case Collation::LONG_SECONDARY_TAG:
    918    case Collation::LATIN_EXPANSION_TAG:
    919        // copy as is
    920        break;
    921    case Collation::EXPANSION32_TAG: {
    922        const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
    923        int32_t length = Collation::lengthFromCE32(ce32);
    924        ce32 = encodeExpansion32(
    925            reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
    926        break;
    927    }
    928    case Collation::EXPANSION_TAG: {
    929        const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
    930        int32_t length = Collation::lengthFromCE32(ce32);
    931        ce32 = encodeExpansion(baseCEs, length, errorCode);
    932        break;
    933    }
    934    case Collation::PREFIX_TAG: {
    935        // Flatten prefixes and nested suffixes (contractions)
    936        // into a linear list of ConditionalCE32.
    937        const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
    938        ce32 = CollationData::readCE32(p);  // Default if no prefix match.
    939        if(!withContext) {
    940            return copyFromBaseCE32(c, ce32, false, errorCode);
    941        }
    942        ConditionalCE32 head;
    943        UnicodeString context(static_cast<char16_t>(0));
    944        int32_t index;
    945        if(Collation::isContractionCE32(ce32)) {
    946            index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
    947        } else {
    948            ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
    949            head.next = index = addConditionalCE32(context, ce32, errorCode);
    950        }
    951        if(U_FAILURE(errorCode)) { return 0; }
    952        ConditionalCE32 *cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
    953        UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
    954        while(prefixes.next(errorCode)) {
    955            context = prefixes.getString();
    956            context.reverse();
    957            context.insert(0, static_cast<char16_t>(context.length()));
    958            ce32 = static_cast<uint32_t>(prefixes.getValue());
    959            if(Collation::isContractionCE32(ce32)) {
    960                index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
    961            } else {
    962                ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
    963                cond->next = index = addConditionalCE32(context, ce32, errorCode);
    964            }
    965            if(U_FAILURE(errorCode)) { return 0; }
    966            cond = getConditionalCE32(index);
    967        }
    968        ce32 = makeBuilderContextCE32(head.next);
    969        contextChars.add(c);
    970        break;
    971    }
    972    case Collation::CONTRACTION_TAG: {
    973        if(!withContext) {
    974            const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
    975            ce32 = CollationData::readCE32(p);  // Default if no suffix match.
    976            return copyFromBaseCE32(c, ce32, false, errorCode);
    977        }
    978        ConditionalCE32 head;
    979        UnicodeString context(static_cast<char16_t>(0));
    980        copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
    981        ce32 = makeBuilderContextCE32(head.next);
    982        contextChars.add(c);
    983        break;
    984    }
    985    case Collation::HANGUL_TAG:
    986        errorCode = U_UNSUPPORTED_ERROR;  // We forbid tailoring of Hangul syllables.
    987        break;
    988    case Collation::OFFSET_TAG:
    989        ce32 = getCE32FromOffsetCE32(true, c, ce32);
    990        break;
    991    case Collation::IMPLICIT_TAG:
    992        ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
    993        break;
    994    default:
    995        UPRV_UNREACHABLE_EXIT;  // require ce32 == base->getFinalCE32(ce32)
    996    }
    997    return ce32;
    998 }
    999 
   1000 int32_t
   1001 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
   1002                                                   ConditionalCE32 *cond, UErrorCode &errorCode) {
   1003    if(U_FAILURE(errorCode)) { return 0; }
   1004    const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
   1005    int32_t index;
   1006    if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
   1007        // No match on the single code point.
   1008        // We are underneath a prefix, and the default mapping is just
   1009        // a fallback to the mappings for a shorter prefix.
   1010        U_ASSERT(context.length() > 1);
   1011        index = -1;
   1012    } else {
   1013        ce32 = CollationData::readCE32(p);  // Default if no suffix match.
   1014        U_ASSERT(!Collation::isContractionCE32(ce32));
   1015        ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
   1016        cond->next = index = addConditionalCE32(context, ce32, errorCode);
   1017        if(U_FAILURE(errorCode)) { return 0; }
   1018        cond = getConditionalCE32(index);
   1019    }
   1020 
   1021    int32_t suffixStart = context.length();
   1022    UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
   1023    while(suffixes.next(errorCode)) {
   1024        context.append(suffixes.getString());
   1025        ce32 = copyFromBaseCE32(c, static_cast<uint32_t>(suffixes.getValue()), true, errorCode);
   1026        cond->next = index = addConditionalCE32(context, ce32, errorCode);
   1027        if(U_FAILURE(errorCode)) { return 0; }
   1028        // No need to update the unsafeBackwardSet because the tailoring set
   1029        // is already a copy of the base set.
   1030        cond = getConditionalCE32(index);
   1031        context.truncate(suffixStart);
   1032    }
   1033    U_ASSERT(index >= 0);
   1034    return index;
   1035 }
   1036 
   1037 class CopyHelper {
   1038 public:
   1039    CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d,
   1040               const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode)
   1041            : src(s), dest(d), modifier(m),
   1042              errorCode(initialErrorCode) {}
   1043 
   1044    UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) {
   1045        ce32 = copyCE32(ce32);
   1046        utrie2_setRange32(dest.trie, start, end, ce32, true, &errorCode);
   1047        if(CollationDataBuilder::isBuilderContextCE32(ce32)) {
   1048            dest.contextChars.add(start, end);
   1049        }
   1050        return U_SUCCESS(errorCode);
   1051    }
   1052 
   1053    uint32_t copyCE32(uint32_t ce32) {
   1054        if(!Collation::isSpecialCE32(ce32)) {
   1055            int64_t ce = modifier.modifyCE32(ce32);
   1056            if(ce != Collation::NO_CE) {
   1057                ce32 = dest.encodeOneCE(ce, errorCode);
   1058            }
   1059        } else {
   1060            int32_t tag = Collation::tagFromCE32(ce32);
   1061            if(tag == Collation::EXPANSION32_TAG) {
   1062                const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer());
   1063                srcCE32s += Collation::indexFromCE32(ce32);
   1064                int32_t length = Collation::lengthFromCE32(ce32);
   1065                // Inspect the source CE32s. Just copy them if none are modified.
   1066                // Otherwise copy to modifiedCEs, with modifications.
   1067                UBool isModified = false;
   1068                for(int32_t i = 0; i < length; ++i) {
   1069                    ce32 = srcCE32s[i];
   1070                    int64_t ce;
   1071                    if(Collation::isSpecialCE32(ce32) ||
   1072                            (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) {
   1073                        if(isModified) {
   1074                            modifiedCEs[i] = Collation::ceFromCE32(ce32);
   1075                        }
   1076                    } else {
   1077                        if(!isModified) {
   1078                            for(int32_t j = 0; j < i; ++j) {
   1079                                modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]);
   1080                            }
   1081                            isModified = true;
   1082                        }
   1083                        modifiedCEs[i] = ce;
   1084                    }
   1085                }
   1086                if(isModified) {
   1087                    ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
   1088                } else {
   1089                    ce32 = dest.encodeExpansion32(
   1090                        reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode);
   1091                }
   1092            } else if(tag == Collation::EXPANSION_TAG) {
   1093                const int64_t *srcCEs = src.ce64s.getBuffer();
   1094                srcCEs += Collation::indexFromCE32(ce32);
   1095                int32_t length = Collation::lengthFromCE32(ce32);
   1096                // Inspect the source CEs. Just copy them if none are modified.
   1097                // Otherwise copy to modifiedCEs, with modifications.
   1098                UBool isModified = false;
   1099                for(int32_t i = 0; i < length; ++i) {
   1100                    int64_t srcCE = srcCEs[i];
   1101                    int64_t ce = modifier.modifyCE(srcCE);
   1102                    if(ce == Collation::NO_CE) {
   1103                        if(isModified) {
   1104                            modifiedCEs[i] = srcCE;
   1105                        }
   1106                    } else {
   1107                        if(!isModified) {
   1108                            for(int32_t j = 0; j < i; ++j) {
   1109                                modifiedCEs[j] = srcCEs[j];
   1110                            }
   1111                            isModified = true;
   1112                        }
   1113                        modifiedCEs[i] = ce;
   1114                    }
   1115                }
   1116                if(isModified) {
   1117                    ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
   1118                } else {
   1119                    ce32 = dest.encodeExpansion(srcCEs, length, errorCode);
   1120                }
   1121            } else if(tag == Collation::BUILDER_DATA_TAG) {
   1122                // Copy the list of ConditionalCE32.
   1123                ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32);
   1124                U_ASSERT(!cond->hasContext());
   1125                int32_t destIndex = dest.addConditionalCE32(
   1126                        cond->context, copyCE32(cond->ce32), errorCode);
   1127                ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex);
   1128                while(cond->next >= 0) {
   1129                    cond = src.getConditionalCE32(cond->next);
   1130                    ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex);
   1131                    destIndex = dest.addConditionalCE32(
   1132                            cond->context, copyCE32(cond->ce32), errorCode);
   1133                    int32_t suffixStart = cond->prefixLength() + 1;
   1134                    dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart));
   1135                    prevDestCond->next = destIndex;
   1136                }
   1137            } else {
   1138                // Just copy long CEs and Latin mini expansions (and other expected values) as is,
   1139                // assuming that the modifier would not modify them.
   1140                U_ASSERT(tag == Collation::LONG_PRIMARY_TAG ||
   1141                        tag == Collation::LONG_SECONDARY_TAG ||
   1142                        tag == Collation::LATIN_EXPANSION_TAG ||
   1143                        tag == Collation::HANGUL_TAG);
   1144            }
   1145        }
   1146        return ce32;
   1147    }
   1148 
   1149    const CollationDataBuilder &src;
   1150    CollationDataBuilder &dest;
   1151    const CollationDataBuilder::CEModifier &modifier;
   1152    int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH];
   1153    UErrorCode errorCode;
   1154 };
   1155 
   1156 U_CDECL_BEGIN
   1157 
   1158 static UBool U_CALLCONV
   1159 enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1160    return
   1161        value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 ||
   1162        ((CopyHelper *)context)->copyRangeCE32(start, end, value);
   1163 }
   1164 
   1165 U_CDECL_END
   1166 
   1167 void
   1168 CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
   1169                               UErrorCode &errorCode) {
   1170    if(U_FAILURE(errorCode)) { return; }
   1171    if(trie == nullptr || utrie2_isFrozen(trie)) {
   1172        errorCode = U_INVALID_STATE_ERROR;
   1173        return;
   1174    }
   1175    CopyHelper helper(src, *this, modifier, errorCode);
   1176    utrie2_enum(src.trie, nullptr, enumRangeForCopy, &helper);
   1177    errorCode = helper.errorCode;
   1178    // Update the contextChars and the unsafeBackwardSet while copying,
   1179    // in case a character had conditional mappings in the source builder
   1180    // and they were removed later.
   1181    modified |= src.modified;
   1182 }
   1183 
   1184 void
   1185 CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) {
   1186    if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
   1187    UnicodeSetIterator iter(set);
   1188    while(iter.next() && !iter.isString()) {
   1189        UChar32 c = iter.getCodepoint();
   1190        uint32_t ce32 = utrie2_get32(trie, c);
   1191        if(ce32 == Collation::FALLBACK_CE32) {
   1192            ce32 = base->getFinalCE32(base->getCE32(c));
   1193            ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
   1194            utrie2_set32(trie, c, ce32, &errorCode);
   1195        }
   1196    }
   1197    modified = true;
   1198 }
   1199 
   1200 void
   1201 CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) {
   1202    if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
   1203    UnicodeSetIterator iter(set);
   1204    while(iter.next() && !iter.isString()) {
   1205        UChar32 c = iter.getCodepoint();
   1206        uint32_t ce32 = utrie2_get32(trie, c);
   1207        if(ce32 == Collation::FALLBACK_CE32) {
   1208            ce32 = base->getFinalCE32(base->getCE32(c));
   1209            if(Collation::ce32HasContext(ce32)) {
   1210                ce32 = copyFromBaseCE32(c, ce32, false /* without context */, errorCode);
   1211                utrie2_set32(trie, c, ce32, &errorCode);
   1212            }
   1213        } else if(isBuilderContextCE32(ce32)) {
   1214            ce32 = getConditionalCE32ForCE32(ce32)->ce32;
   1215            // Simply abandon the list of ConditionalCE32.
   1216            // The caller will copy this builder in the end,
   1217            // eliminating unreachable data.
   1218            utrie2_set32(trie, c, ce32, &errorCode);
   1219            contextChars.remove(c);
   1220        }
   1221    }
   1222    modified = true;
   1223 }
   1224 
   1225 UBool
   1226 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
   1227    if(U_FAILURE(errorCode)) { return false; }
   1228    UBool anyJamoAssigned = base == nullptr;  // always set jamoCE32s in the base data
   1229    UBool needToCopyFromBase = false;
   1230    for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
   1231        UChar32 jamo = jamoCpFromIndex(j);
   1232        UBool fromBase = false;
   1233        uint32_t ce32 = utrie2_get32(trie, jamo);
   1234        anyJamoAssigned |= Collation::isAssignedCE32(ce32);
   1235        // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
   1236        // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
   1237        if(ce32 == Collation::FALLBACK_CE32) {
   1238            fromBase = true;
   1239            ce32 = base->getCE32(jamo);
   1240        }
   1241        if(Collation::isSpecialCE32(ce32)) {
   1242            switch(Collation::tagFromCE32(ce32)) {
   1243            case Collation::LONG_PRIMARY_TAG:
   1244            case Collation::LONG_SECONDARY_TAG:
   1245            case Collation::LATIN_EXPANSION_TAG:
   1246                // Copy the ce32 as-is.
   1247                break;
   1248            case Collation::EXPANSION32_TAG:
   1249            case Collation::EXPANSION_TAG:
   1250            case Collation::PREFIX_TAG:
   1251            case Collation::CONTRACTION_TAG:
   1252                if(fromBase) {
   1253                    // Defer copying until we know if anyJamoAssigned.
   1254                    ce32 = Collation::FALLBACK_CE32;
   1255                    needToCopyFromBase = true;
   1256                }
   1257                break;
   1258            case Collation::IMPLICIT_TAG:
   1259                // An unassigned Jamo should only occur in tests with incomplete bases.
   1260                U_ASSERT(fromBase);
   1261                ce32 = Collation::FALLBACK_CE32;
   1262                needToCopyFromBase = true;
   1263                break;
   1264            case Collation::OFFSET_TAG:
   1265                ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
   1266                break;
   1267            case Collation::FALLBACK_TAG:
   1268            case Collation::RESERVED_TAG_3:
   1269            case Collation::BUILDER_DATA_TAG:
   1270            case Collation::DIGIT_TAG:
   1271            case Collation::U0000_TAG:
   1272            case Collation::HANGUL_TAG:
   1273            case Collation::LEAD_SURROGATE_TAG:
   1274                errorCode = U_INTERNAL_PROGRAM_ERROR;
   1275                return false;
   1276            }
   1277        }
   1278        jamoCE32s[j] = ce32;
   1279    }
   1280    if(anyJamoAssigned && needToCopyFromBase) {
   1281        for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {
   1282            if(jamoCE32s[j] == Collation::FALLBACK_CE32) {
   1283                UChar32 jamo = jamoCpFromIndex(j);
   1284                jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo),
   1285                                                /*withContext=*/ true, errorCode);
   1286            }
   1287        }
   1288    }
   1289    return anyJamoAssigned && U_SUCCESS(errorCode);
   1290 }
   1291 
   1292 void
   1293 CollationDataBuilder::setDigitTags(UErrorCode &errorCode) {
   1294    UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode);
   1295    if(U_FAILURE(errorCode)) { return; }
   1296    UnicodeSetIterator iter(digits);
   1297    while(iter.next()) {
   1298        U_ASSERT(!iter.isString());
   1299        UChar32 c = iter.getCodepoint();
   1300        uint32_t ce32 = utrie2_get32(trie, c);
   1301        if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) {
   1302            int32_t index = addCE32(ce32, errorCode);
   1303            if(U_FAILURE(errorCode)) { return; }
   1304            if(index > Collation::MAX_INDEX) {
   1305                errorCode = U_BUFFER_OVERFLOW_ERROR;
   1306                return;
   1307            }
   1308            ce32 = Collation::makeCE32FromTagIndexAndLength(
   1309                    Collation::DIGIT_TAG, index, u_charDigitValue(c));
   1310            utrie2_set32(trie, c, ce32, &errorCode);
   1311        }
   1312    }
   1313 }
   1314 
   1315 U_CDECL_BEGIN
   1316 
   1317 static UBool U_CALLCONV
   1318 enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
   1319    int32_t *pValue = (int32_t *)context;
   1320    if(value == Collation::UNASSIGNED_CE32) {
   1321        value = Collation::LEAD_ALL_UNASSIGNED;
   1322    } else if(value == Collation::FALLBACK_CE32) {
   1323        value = Collation::LEAD_ALL_FALLBACK;
   1324    } else {
   1325        *pValue = Collation::LEAD_MIXED;
   1326        return false;
   1327    }
   1328    if(*pValue < 0) {
   1329        *pValue = (int32_t)value;
   1330    } else if(*pValue != (int32_t)value) {
   1331        *pValue = Collation::LEAD_MIXED;
   1332        return false;
   1333    }
   1334    return true;
   1335 }
   1336 
   1337 U_CDECL_END
   1338 
   1339 void
   1340 CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) {
   1341    for(char16_t lead = 0xd800; lead < 0xdc00; ++lead) {
   1342        int32_t value = -1;
   1343        utrie2_enumForLeadSurrogate(trie, lead, nullptr, enumRangeLeadValue, &value);
   1344        utrie2_set32ForLeadSurrogateCodeUnit(
   1345            trie, lead,
   1346            Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | static_cast<uint32_t>(value),
   1347            &errorCode);
   1348    }
   1349 }
   1350 
   1351 void
   1352 CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
   1353    buildMappings(data, errorCode);
   1354    if(base != nullptr) {
   1355        data.numericPrimary = base->numericPrimary;
   1356        data.compressibleBytes = base->compressibleBytes;
   1357        data.numScripts = base->numScripts;
   1358        data.scriptsIndex = base->scriptsIndex;
   1359        data.scriptStarts = base->scriptStarts;
   1360        data.scriptStartsLength = base->scriptStartsLength;
   1361    }
   1362    buildFastLatinTable(data, errorCode);
   1363 }
   1364 
   1365 void
   1366 CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) {
   1367    if(U_FAILURE(errorCode)) { return; }
   1368    if(trie == nullptr || utrie2_isFrozen(trie)) {
   1369        errorCode = U_INVALID_STATE_ERROR;
   1370        return;
   1371    }
   1372 
   1373    buildContexts(errorCode);
   1374 
   1375    uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
   1376    int32_t jamoIndex = -1;
   1377    if(getJamoCE32s(jamoCE32s, errorCode)) {
   1378        jamoIndex = ce32s.size();
   1379        for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
   1380            ce32s.addElement(static_cast<int32_t>(jamoCE32s[i]), errorCode);
   1381        }
   1382        // Small optimization: Use a bit in the Hangul ce32
   1383        // to indicate that none of the Jamo CE32s are isSpecialCE32()
   1384        // (as it should be in the root collator).
   1385        // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
   1386        // In order to still have good trie compression and keep this code simple,
   1387        // we only set this flag if a whole block of 588 Hangul syllables starting with
   1388        // a common leading consonant (Jamo L) has this property.
   1389        UBool isAnyJamoVTSpecial = false;
   1390        for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
   1391            if(Collation::isSpecialCE32(jamoCE32s[i])) {
   1392                isAnyJamoVTSpecial = true;
   1393                break;
   1394            }
   1395        }
   1396        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
   1397        UChar32 c = Hangul::HANGUL_BASE;
   1398        for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
   1399            uint32_t ce32 = hangulCE32;
   1400            if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
   1401                ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
   1402            }
   1403            UChar32 limit = c + Hangul::JAMO_VT_COUNT;
   1404            utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
   1405            c = limit;
   1406        }
   1407    } else {
   1408        // Copy the Hangul CE32s from the base in blocks per Jamo L,
   1409        // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
   1410        for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
   1411            uint32_t ce32 = base->getCE32(c);
   1412            U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG));
   1413            UChar32 limit = c + Hangul::JAMO_VT_COUNT;
   1414            utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
   1415            c = limit;
   1416        }
   1417    }
   1418 
   1419    setDigitTags(errorCode);
   1420    setLeadSurrogates(errorCode);
   1421 
   1422    if (icu4xMode) {
   1423        // Make sure that starters that occur is the middle of a
   1424        // contraction have contraction ce32 with the
   1425        // `CONTRACT_HAS_STARTER` flag set so that starters that
   1426        // can occur in a non-final position in a contraction can
   1427        // be easily recognized from having a contraction ce32
   1428        // that has the `CONTRACT_HAS_STARTER` flag set.
   1429 
   1430        UCharsTrieBuilder contractionBuilder(errorCode);
   1431        // Intentionally unpaired low surrogate to make it never
   1432        // match well-formed UTF-16 which ICU4X feeds to the
   1433        // matcher.
   1434        UnicodeString placeholder(0xDC00);
   1435 
   1436        for (UChar32 c : contractionMiddleStarter.codePoints()) {
   1437            uint32_t ce32 = utrie2_get32(trie, c);
   1438            UBool fromBase = false;
   1439            if(ce32 == Collation::FALLBACK_CE32) {
   1440                fromBase = true;
   1441                ce32 = base->getCE32(c);
   1442            }
   1443            if (!(Collation::hasCE32Tag(ce32, Collation::CONTRACTION_TAG) && (ce32 & Collation::CONTRACT_HAS_STARTER))) {
   1444                if (fromBase) {
   1445                    // This case does not actually happen as of February 2025.
   1446                    ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
   1447                }
   1448                if (Collation::hasCE32Tag(ce32, Collation::CONTRACTION_TAG)) {
   1449                    // This middle starter is also the first character of another
   1450                    // contraction, but that contraction does not have the
   1451                    // CONTRACT_HAS_STARTER flag. Let's add the flag to
   1452                    // mark this at the expense of pessimizing the matching
   1453                    // of this contraction.
   1454                    // As of February 2025, this case does not actually occur
   1455                    // in CLDR.
   1456                    ce32 |= Collation::CONTRACT_HAS_STARTER;
   1457                } else {
   1458                    // This middle starter is not also the first character
   1459                    // in another contraction.
   1460 
   1461                    // The UCharsTrie needs to contain some placeholder
   1462                    // because it cannot be empty. We build a trie
   1463                    // that never actually matches anything that ICU4X can try to
   1464                    // match, since ICU4X always passes well-formed UTF-16 to the
   1465                    // matcher and we put an unpaired low surrogate into the trie.
   1466                    // This pessimizes the character to CE mapping of the `c`,
   1467                    // since useless trie matching will be attempted but as of
   1468                    // February 2025, only two relatively rare characters are affected.
   1469                    contractionBuilder.clear();
   1470                    contractionBuilder.add(placeholder, static_cast<int32_t>(ce32), errorCode);
   1471 
   1472                    int32_t index = addContextTrie(ce32, contractionBuilder, errorCode);
   1473                    if(U_FAILURE(errorCode)) { return; }
   1474                    if(index > Collation::MAX_INDEX) {
   1475                        errorCode = U_BUFFER_OVERFLOW_ERROR;
   1476                        return;
   1477                    }
   1478                    // Set CONTRACT_HAS_STARTER to make identical prefix matching able to catch this.
   1479                    ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | Collation::CONTRACT_HAS_STARTER;
   1480                }
   1481                utrie2_set32(trie, c, ce32, &errorCode);
   1482            }
   1483        }
   1484    } else {
   1485        // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
   1486        ce32s.setElementAt(static_cast<int32_t>(utrie2_get32(trie, 0)), 0);
   1487        utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
   1488    }
   1489 
   1490    utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
   1491    if(U_FAILURE(errorCode)) { return; }
   1492 
   1493    // Mark each lead surrogate as "unsafe"
   1494    // if any of its 1024 associated supplementary code points is "unsafe".
   1495    UChar32 c = 0x10000;
   1496    for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
   1497        if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
   1498            unsafeBackwardSet.add(lead);
   1499        }
   1500    }
   1501    unsafeBackwardSet.freeze();
   1502 
   1503    data.trie = trie;
   1504    data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer());
   1505    data.ces = ce64s.getBuffer();
   1506    data.contexts = contexts.getBuffer();
   1507 
   1508    data.ce32sLength = ce32s.size();
   1509    data.cesLength = ce64s.size();
   1510    data.contextsLength = contexts.length();
   1511 
   1512    data.base = base;
   1513    if(jamoIndex >= 0) {
   1514        data.jamoCE32s = data.ce32s + jamoIndex;
   1515    } else {
   1516        data.jamoCE32s = base->jamoCE32s;
   1517    }
   1518    data.unsafeBackwardSet = &unsafeBackwardSet;
   1519 }
   1520 
   1521 void
   1522 CollationDataBuilder::clearContexts() {
   1523    contexts.remove();
   1524    // Incrementing the contexts build "era" invalidates all of the builtCE32
   1525    // from before this clearContexts() call.
   1526    // Simpler than finding and resetting all of those fields.
   1527    ++contextsEra;
   1528 }
   1529 
   1530 void
   1531 CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
   1532    if(U_FAILURE(errorCode)) { return; }
   1533    // Ignore abandoned lists and the cached builtCE32,
   1534    // and build all contexts from scratch.
   1535    clearContexts();
   1536    UnicodeSetIterator iter(contextChars);
   1537    while(U_SUCCESS(errorCode) && iter.next()) {
   1538        U_ASSERT(!iter.isString());
   1539        UChar32 c = iter.getCodepoint();
   1540        uint32_t ce32 = utrie2_get32(trie, c);
   1541        if(!isBuilderContextCE32(ce32)) {
   1542            // Impossible: No context data for c in contextChars.
   1543            errorCode = U_INTERNAL_PROGRAM_ERROR;
   1544            return;
   1545        }
   1546        ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32);
   1547        ce32 = buildContext(cond, errorCode);
   1548        utrie2_set32(trie, c, ce32, &errorCode);
   1549    }
   1550 }
   1551 
   1552 uint32_t
   1553 CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
   1554    if(U_FAILURE(errorCode)) { return 0; }
   1555    // The list head must have no context.
   1556    U_ASSERT(!head->hasContext());
   1557    // The list head must be followed by one or more nodes that all do have context.
   1558    U_ASSERT(head->next >= 0);
   1559    UCharsTrieBuilder prefixBuilder(errorCode);
   1560    UCharsTrieBuilder contractionBuilder(errorCode);
   1561    // This outer loop goes from each prefix to the next.
   1562    // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
   1563    // If there are multiple suffixes for the same prefix,
   1564    // then an inner loop builds a contraction trie for them.
   1565    for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
   1566        if(U_FAILURE(errorCode)) { return 0; }  // early out for memory allocation errors
   1567        // After the list head, the prefix or suffix can be empty, but not both.
   1568        U_ASSERT(cond == head || cond->hasContext());
   1569        int32_t prefixLength = cond->prefixLength();
   1570        UnicodeString prefix(cond->context, 0, prefixLength + 1);
   1571        // Collect all contraction suffixes for one prefix.
   1572        ConditionalCE32 *firstCond = cond;
   1573        ConditionalCE32 *lastCond;
   1574        do {
   1575            lastCond = cond;
   1576            // Clear the defaultCE32 fields as we go.
   1577            // They are left over from building a previous version of this list of contexts.
   1578            //
   1579            // One of the code paths below may copy a preceding defaultCE32
   1580            // into its emptySuffixCE32.
   1581            // If a new suffix has been inserted before what used to be
   1582            // the firstCond for its prefix, then that previous firstCond could still
   1583            // contain an outdated defaultCE32 from an earlier buildContext() and
   1584            // result in an incorrect emptySuffixCE32.
   1585            // So we reset all defaultCE32 before reading and setting new values.
   1586            cond->defaultCE32 = Collation::NO_CE32;
   1587        } while(cond->next >= 0 &&
   1588                (cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
   1589        uint32_t ce32;
   1590        int32_t suffixStart = prefixLength + 1;  // == prefix.length()
   1591        if(lastCond->context.length() == suffixStart) {
   1592            // One prefix without contraction suffix.
   1593            U_ASSERT(firstCond == lastCond);
   1594            ce32 = lastCond->ce32;
   1595            cond = lastCond;
   1596        } else {
   1597            // Build the contractions trie.
   1598            contractionBuilder.clear();
   1599            // Entry for an empty suffix, to be stored before the trie.
   1600            uint32_t emptySuffixCE32 = 0;
   1601            uint32_t flags = 0;
   1602            if(firstCond->context.length() == suffixStart) {
   1603                // There is a mapping for the prefix and the single character c. (p|c)
   1604                // If no other suffix matches, then we return this value.
   1605                emptySuffixCE32 = firstCond->ce32;
   1606                cond = getConditionalCE32(firstCond->next);
   1607            } else {
   1608                // There is no mapping for the prefix and just the single character.
   1609                // (There is no p|c, only p|cd, p|ce etc.)
   1610                flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
   1611                // When the prefix matches but none of the prefix-specific suffixes,
   1612                // then we fall back to the mappings with the next-longest prefix,
   1613                // and ultimately to mappings with no prefix.
   1614                // Each fallback might be another set of contractions.
   1615                // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
   1616                // then in text "pch" we find the ch contraction.
   1617                for(cond = head;; cond = getConditionalCE32(cond->next)) {
   1618                    int32_t length = cond->prefixLength();
   1619                    if(length == prefixLength) { break; }
   1620                    if(cond->defaultCE32 != Collation::NO_CE32 &&
   1621                            (length==0 || prefix.endsWith(cond->context, 1, length))) {
   1622                        emptySuffixCE32 = cond->defaultCE32;
   1623                    }
   1624                }
   1625                cond = firstCond;
   1626            }
   1627            // Optimization: Set a flag when
   1628            // the first character of every contraction suffix has lccc!=0.
   1629            // Short-circuits contraction matching when a normal letter follows.
   1630            flags |= Collation::CONTRACT_NEXT_CCC;
   1631            // Add all of the non-empty suffixes into the contraction trie.
   1632            for(;;) {
   1633                UnicodeString suffix(cond->context, suffixStart);
   1634                uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
   1635                if(fcd16 <= 0xff) {
   1636                    flags &= ~Collation::CONTRACT_NEXT_CCC;
   1637                }
   1638                fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
   1639                if(fcd16 > 0xff) {
   1640                    // The last suffix character has lccc!=0, allowing for discontiguous contractions.
   1641                    flags |= Collation::CONTRACT_TRAILING_CCC;
   1642                }
   1643                if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
   1644                    for (int32_t i = 0; i < suffix.length();) {
   1645                        UChar32 c = suffix.char32At(i);
   1646                            if (!u_getCombiningClass(c)) {
   1647                                flags |= Collation::CONTRACT_HAS_STARTER;
   1648                                break;
   1649                            }
   1650                        if (c > 0xFFFF) {
   1651                            i += 2;
   1652                        } else {
   1653                            ++i;
   1654                        }
   1655                    }
   1656                }
   1657                contractionBuilder.add(suffix, static_cast<int32_t>(cond->ce32), errorCode);
   1658                if(cond == lastCond) { break; }
   1659                cond = getConditionalCE32(cond->next);
   1660            }
   1661            int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
   1662            if(U_FAILURE(errorCode)) { return 0; }
   1663            if(index > Collation::MAX_INDEX) {
   1664                errorCode = U_BUFFER_OVERFLOW_ERROR;
   1665                return 0;
   1666            }
   1667            ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
   1668        }
   1669        U_ASSERT(cond == lastCond);
   1670        firstCond->defaultCE32 = ce32;
   1671        if(prefixLength == 0) {
   1672            if(cond->next < 0) {
   1673                // No non-empty prefixes, only contractions.
   1674                return ce32;
   1675            }
   1676        } else {
   1677            prefix.remove(0, 1);  // Remove the length unit.
   1678            prefix.reverse();
   1679            prefixBuilder.add(prefix, static_cast<int32_t>(ce32), errorCode);
   1680            if(cond->next < 0) { break; }
   1681        }
   1682    }
   1683    U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
   1684    int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
   1685    if(U_FAILURE(errorCode)) { return 0; }
   1686    if(index > Collation::MAX_INDEX) {
   1687        errorCode = U_BUFFER_OVERFLOW_ERROR;
   1688        return 0;
   1689    }
   1690    return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
   1691 }
   1692 
   1693 int32_t
   1694 CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
   1695                                     UErrorCode &errorCode) {
   1696    UnicodeString context;
   1697    context.append(static_cast<char16_t>(defaultCE32 >> 16)).append(static_cast<char16_t>(defaultCE32));
   1698    UnicodeString trieString;
   1699    context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode));
   1700    if(U_FAILURE(errorCode)) { return -1; }
   1701    int32_t index = contexts.indexOf(context);
   1702    if(index < 0) {
   1703        index = contexts.length();
   1704        contexts.append(context);
   1705    }
   1706    return index;
   1707 }
   1708 
   1709 void
   1710 CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) {
   1711    if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; }
   1712 
   1713    delete fastLatinBuilder;
   1714    fastLatinBuilder = new CollationFastLatinBuilder(errorCode);
   1715    if(fastLatinBuilder == nullptr) {
   1716        errorCode = U_MEMORY_ALLOCATION_ERROR;
   1717        return;
   1718    }
   1719    if(fastLatinBuilder->forData(data, errorCode)) {
   1720        const uint16_t *table = fastLatinBuilder->getTable();
   1721        int32_t length = fastLatinBuilder->lengthOfTable();
   1722        if(base != nullptr && length == base->fastLatinTableLength &&
   1723                uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) {
   1724            // Same fast Latin table as in the base, use that one instead.
   1725            delete fastLatinBuilder;
   1726            fastLatinBuilder = nullptr;
   1727            table = base->fastLatinTable;
   1728        }
   1729        data.fastLatinTable = table;
   1730        data.fastLatinTableLength = length;
   1731    } else {
   1732        delete fastLatinBuilder;
   1733        fastLatinBuilder = nullptr;
   1734    }
   1735 }
   1736 
   1737 int32_t
   1738 CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) {
   1739    return getCEs(s, 0, ces, cesLength);
   1740 }
   1741 
   1742 int32_t
   1743 CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s,
   1744                             int64_t ces[], int32_t cesLength) {
   1745    int32_t prefixLength = prefix.length();
   1746    if(prefixLength == 0) {
   1747        return getCEs(s, 0, ces, cesLength);
   1748    } else {
   1749        return getCEs(prefix + s, prefixLength, ces, cesLength);
   1750    }
   1751 }
   1752 
   1753 int32_t
   1754 CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start,
   1755                             int64_t ces[], int32_t cesLength) {
   1756    if(collIter == nullptr) {
   1757        collIter = new DataBuilderCollationIterator(*this);
   1758        if(collIter == nullptr) { return 0; }
   1759    }
   1760    return collIter->fetchCEs(s, start, ces, cesLength);
   1761 }
   1762 
   1763 U_NAMESPACE_END
   1764 
   1765 #endif  // !UCONFIG_NO_COLLATION