collationdatabuilder.cpp (72338B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdatabuilder.cpp 9 * 10 * (replaced the former ucol_elm.cpp) 11 * 12 * created on: 2012apr01 13 * created by: Markus W. Scherer 14 */ 15 16 #include "unicode/utypes.h" 17 18 #if !UCONFIG_NO_COLLATION 19 20 #include "unicode/localpointer.h" 21 #include "unicode/uchar.h" 22 #include "unicode/ucharstrie.h" 23 #include "unicode/ucharstriebuilder.h" 24 #include "unicode/uniset.h" 25 #include "unicode/unistr.h" 26 #include "unicode/usetiter.h" 27 #include "unicode/utf16.h" 28 #include "cmemory.h" 29 #include "collation.h" 30 #include "collationdata.h" 31 #include "collationdatabuilder.h" 32 #include "collationfastlatinbuilder.h" 33 #include "collationiterator.h" 34 #include "normalizer2impl.h" 35 #include "utrie2.h" 36 #include "uvectr32.h" 37 #include "uvectr64.h" 38 #include "uvector.h" 39 40 U_NAMESPACE_BEGIN 41 42 CollationDataBuilder::CEModifier::~CEModifier() {} 43 44 /** 45 * Build-time context and CE32 for a code point. 46 * If a code point has contextual mappings, then the default (no-context) mapping 47 * and all conditional mappings are stored in a singly-linked list 48 * of ConditionalCE32, sorted by context strings. 49 * 50 * Context strings sort by prefix length, then by prefix, then by contraction suffix. 51 * Context strings must be unique and in ascending order. 52 */ 53 struct ConditionalCE32 : public UMemory { 54 ConditionalCE32() 55 : context(), 56 ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), 57 next(-1) {} 58 ConditionalCE32(const UnicodeString &ct, uint32_t ce) 59 : context(ct), 60 ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), 61 next(-1) {} 62 63 inline UBool hasContext() const { return context.length() > 1; } 64 inline int32_t prefixLength() const { return context.charAt(0); } 65 66 /** 67 * "\0" for the first entry for any code point, with its default CE32. 68 * 69 * Otherwise one unit with the length of the prefix string, 70 * then the prefix string, then the contraction suffix. 71 */ 72 UnicodeString context; 73 /** 74 * CE32 for the code point and its context. 75 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). 76 */ 77 uint32_t ce32; 78 /** 79 * Default CE32 for all contexts with this same prefix. 80 * Initially NO_CE32. Set only while building runtime data structures, 81 * and only on one of the nodes of a sub-list with the same prefix. 82 */ 83 uint32_t defaultCE32; 84 /** 85 * CE32 for the built contexts. 86 * When fetching CEs from the builder, the contexts are built into their runtime form 87 * so that the normal collation implementation can process them. 88 * The result is cached in the list head. It is reset when the contexts are modified. 89 * All of these builtCE32 are invalidated by clearContexts(), 90 * via incrementing the contextsEra. 91 */ 92 uint32_t builtCE32; 93 /** 94 * The "era" of building intermediate contexts when the above builtCE32 was set. 95 * When the array of cached, temporary contexts overflows, then clearContexts() 96 * removes them all and invalidates the builtCE32 that used to point to built tries. 97 */ 98 int32_t era = -1; 99 /** 100 * Index of the next ConditionalCE32. 101 * Negative for the end of the list. 102 */ 103 int32_t next; 104 // Note: We could create a separate class for all of the contextual mappings for 105 // a code point, with the builtCE32, the era, and a list of the actual mappings. 106 // The class that represents one mapping would then not need to 107 // store those fields in each element. 108 }; 109 110 U_CDECL_BEGIN 111 112 void U_CALLCONV 113 uprv_deleteConditionalCE32(void *obj) { 114 delete static_cast<ConditionalCE32 *>(obj); 115 } 116 117 U_CDECL_END 118 119 /** 120 * Build-time collation element and character iterator. 121 * Uses the runtime CollationIterator for fetching CEs for a string 122 * but reads from the builder's unfinished data structures. 123 * In particular, this class reads from the unfinished trie 124 * and has to avoid CollationIterator::nextCE() and redirect other 125 * calls to data->getCE32() and data->getCE32FromSupplementary(). 126 * 127 * We do this so that we need not implement the collation algorithm 128 * again for the builder and make it behave exactly like the runtime code. 129 * That would be more difficult to test and maintain than this indirection. 130 * 131 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, 132 * so the data accesses from those code paths need not be modified. 133 * 134 * This class iterates directly over whole code points 135 * so that the CollationIterator does not need the finished trie 136 * for handling the LEAD_SURROGATE_TAG. 137 */ 138 class DataBuilderCollationIterator : public CollationIterator { 139 public: 140 DataBuilderCollationIterator(CollationDataBuilder &b); 141 142 virtual ~DataBuilderCollationIterator(); 143 144 int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength); 145 146 virtual void resetToOffset(int32_t newOffset) override; 147 virtual int32_t getOffset() const override; 148 149 virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; 150 virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; 151 152 protected: 153 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; 154 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; 155 156 virtual uint32_t getDataCE32(UChar32 c) const override; 157 virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) override; 158 159 CollationDataBuilder &builder; 160 CollationData builderData; 161 uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; 162 const UnicodeString *s; 163 int32_t pos; 164 }; 165 166 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b) 167 : CollationIterator(&builderData, /*numeric=*/ false), 168 builder(b), builderData(b.nfcImpl), 169 s(nullptr), pos(0) { 170 builderData.base = builder.base; 171 // Set all of the jamoCE32s[] to indirection CE32s. 172 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 173 UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j); 174 jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) | 175 CollationDataBuilder::IS_BUILDER_JAMO_CE32; 176 } 177 builderData.jamoCE32s = jamoCE32s; 178 } 179 180 DataBuilderCollationIterator::~DataBuilderCollationIterator() {} 181 182 int32_t 183 DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start, 184 int64_t ces[], int32_t cesLength) { 185 // Set the pointers each time, in case they changed due to reallocation. 186 builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer()); 187 builderData.ces = builder.ce64s.getBuffer(); 188 builderData.contexts = builder.contexts.getBuffer(); 189 // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). 190 reset(); 191 s = &str; 192 pos = start; 193 UErrorCode errorCode = U_ZERO_ERROR; 194 while(U_SUCCESS(errorCode) && pos < s->length()) { 195 // No need to keep all CEs in the iterator buffer. 196 clearCEs(); 197 UChar32 c = s->char32At(pos); 198 pos += U16_LENGTH(c); 199 uint32_t ce32 = utrie2_get32(builder.trie, c); 200 const CollationData *d; 201 if(ce32 == Collation::FALLBACK_CE32) { 202 d = builder.base; 203 ce32 = builder.base->getCE32(c); 204 } else { 205 d = &builderData; 206 } 207 appendCEsFromCE32(d, c, ce32, /*forward=*/ true, errorCode); 208 U_ASSERT(U_SUCCESS(errorCode)); 209 for(int32_t i = 0; i < getCEsLength(); ++i) { 210 int64_t ce = getCE(i); 211 if(ce != 0) { 212 if(cesLength < Collation::MAX_EXPANSION_LENGTH) { 213 ces[cesLength] = ce; 214 } 215 ++cesLength; 216 } 217 } 218 } 219 return cesLength; 220 } 221 222 void 223 DataBuilderCollationIterator::resetToOffset(int32_t newOffset) { 224 reset(); 225 pos = newOffset; 226 } 227 228 int32_t 229 DataBuilderCollationIterator::getOffset() const { 230 return pos; 231 } 232 233 UChar32 234 DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 235 if(pos == s->length()) { 236 return U_SENTINEL; 237 } 238 UChar32 c = s->char32At(pos); 239 pos += U16_LENGTH(c); 240 return c; 241 } 242 243 UChar32 244 DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 245 if(pos == 0) { 246 return U_SENTINEL; 247 } 248 UChar32 c = s->char32At(pos - 1); 249 pos -= U16_LENGTH(c); 250 return c; 251 } 252 253 void 254 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 255 pos = s->moveIndex32(pos, num); 256 } 257 258 void 259 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 260 pos = s->moveIndex32(pos, -num); 261 } 262 263 uint32_t 264 DataBuilderCollationIterator::getDataCE32(UChar32 c) const { 265 return utrie2_get32(builder.trie, c); 266 } 267 268 uint32_t 269 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) { 270 if (U_FAILURE(errorCode)) { return 0; } 271 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)); 272 if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) { 273 UChar32 jamo = Collation::indexFromCE32(ce32); 274 return utrie2_get32(builder.trie, jamo); 275 } else { 276 ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32); 277 if (cond == nullptr) { 278 errorCode = U_INTERNAL_PROGRAM_ERROR; 279 // TODO: ICU-21531 figure out why this happens. 280 return 0; 281 } 282 if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) { 283 // Build the context-sensitive mappings into their runtime form and cache the result. 284 cond->builtCE32 = builder.buildContext(cond, errorCode); 285 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 286 errorCode = U_ZERO_ERROR; 287 builder.clearContexts(); 288 cond->builtCE32 = builder.buildContext(cond, errorCode); 289 } 290 cond->era = builder.contextsEra; 291 builderData.contexts = builder.contexts.getBuffer(); 292 } 293 return cond->builtCE32; 294 } 295 } 296 297 // ------------------------------------------------------------------------- *** 298 299 CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode) 300 : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 301 base(nullptr), baseSettings(nullptr), 302 trie(nullptr), 303 ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), 304 modified(false), 305 icu4xMode(icu4xMode), 306 fastLatinEnabled(false), fastLatinBuilder(nullptr), 307 collIter(nullptr) { 308 // Reserve the first CE32 for U+0000. 309 if (!icu4xMode) { 310 ce32s.addElement(0, errorCode); 311 } 312 conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); 313 } 314 315 CollationDataBuilder::~CollationDataBuilder() { 316 utrie2_close(trie); 317 delete fastLatinBuilder; 318 delete collIter; 319 } 320 321 void 322 CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) { 323 if(U_FAILURE(errorCode)) { return; } 324 if(trie != nullptr) { 325 errorCode = U_INVALID_STATE_ERROR; 326 return; 327 } 328 if(b == nullptr) { 329 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 330 return; 331 } 332 base = b; 333 334 // For a tailoring, the default is to fall back to the base. 335 // For ICU4X, use the same value for fallback as for the default 336 // to avoid having to have different blocks for the two. 337 trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode); 338 339 if (!icu4xMode) { 340 // Set the Latin-1 letters block so that it is allocated first in the data array, 341 // to try to improve locality of reference when sorting Latin-1 text. 342 // Do not use utrie2_setRange32() since that will not actually allocate blocks 343 // that are filled with the default value. 344 // ASCII (0..7F) is already preallocated anyway. 345 for(UChar32 c = 0xc0; c <= 0xff; ++c) { 346 utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); 347 } 348 349 // Hangul syllables are not tailorable (except via tailoring Jamos). 350 // Always set the Hangul tag to help performance. 351 // Do this here, rather than in buildMappings(), 352 // so that we see the HANGUL_TAG in various assertions. 353 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); 354 utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode); 355 356 // Copy the set contents but don't copy/clone the set as a whole because 357 // that would copy the isFrozen state too. 358 unsafeBackwardSet.addAll(*b->unsafeBackwardSet); 359 } 360 361 if(U_FAILURE(errorCode)) { return; } 362 } 363 364 UBool 365 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end, 366 uint32_t primary, int32_t step, 367 UErrorCode &errorCode) { 368 if(U_FAILURE(errorCode)) { return false; } 369 U_ASSERT(start <= end); 370 // TODO: Do we need to check what values are currently set for start..end? 371 // An offset range is worth it only if we can achieve an overlap between 372 // adjacent UTrie2 blocks of 32 code points each. 373 // An offset CE is also a little more expensive to look up and compute 374 // than a simple CE. 375 // If the range spans at least three UTrie2 block boundaries (> 64 code points), 376 // then we take it. 377 // If the range spans one or two block boundaries and there are 378 // at least 4 code points on either side, then we take it. 379 // (We could additionally require a minimum range length of, say, 16.) 380 int32_t blockDelta = (end >> 5) - (start >> 5); 381 if(2 <= step && step <= 0x7f && 382 (blockDelta >= 3 || 383 (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) { 384 int64_t dataCE = (static_cast<int64_t>(primary) << 32) | (start << 8) | step; 385 if(isCompressiblePrimary(primary)) { dataCE |= 0x80; } 386 int32_t index = addCE(dataCE, errorCode); 387 if(U_FAILURE(errorCode)) { return 0; } 388 if(index > Collation::MAX_INDEX) { 389 errorCode = U_BUFFER_OVERFLOW_ERROR; 390 return 0; 391 } 392 uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index); 393 utrie2_setRange32(trie, start, end, offsetCE32, true, &errorCode); 394 modified = true; 395 return true; 396 } else { 397 return false; 398 } 399 } 400 401 uint32_t 402 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, 403 uint32_t primary, int32_t step, 404 UErrorCode &errorCode) { 405 if(U_FAILURE(errorCode)) { return 0; } 406 UBool isCompressible = isCompressiblePrimary(primary); 407 if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) { 408 return Collation::incThreeBytePrimaryByOffset(primary, isCompressible, 409 (end - start + 1) * step); 410 } else { 411 // Short range: Set individual CE32s. 412 for(;;) { 413 utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode); 414 ++start; 415 primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step); 416 if(start > end) { return primary; } 417 } 418 modified = true; 419 } 420 } 421 422 uint32_t 423 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const { 424 int32_t i = Collation::indexFromCE32(ce32); 425 int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i); 426 uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); 427 return Collation::makeLongPrimaryCE32(p); 428 } 429 430 UBool 431 CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const { 432 return base->isCompressibleLeadByte(b); 433 } 434 435 UBool 436 CollationDataBuilder::isAssigned(UChar32 c) const { 437 return Collation::isAssignedCE32(utrie2_get32(trie, c)); 438 } 439 440 uint32_t 441 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const { 442 uint32_t ce32 = utrie2_get32(trie, c); 443 if(Collation::isLongPrimaryCE32(ce32)) { 444 return Collation::primaryFromLongPrimaryCE32(ce32); 445 } else { 446 return 0; 447 } 448 } 449 450 int64_t 451 CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const { 452 if(U_FAILURE(errorCode)) { return 0; } 453 // Keep parallel with CollationData::getSingleCE(). 454 UBool fromBase = false; 455 uint32_t ce32 = utrie2_get32(trie, c); 456 if(ce32 == Collation::FALLBACK_CE32) { 457 fromBase = true; 458 ce32 = base->getCE32(c); 459 } 460 while(Collation::isSpecialCE32(ce32)) { 461 switch(Collation::tagFromCE32(ce32)) { 462 case Collation::LATIN_EXPANSION_TAG: 463 case Collation::BUILDER_DATA_TAG: 464 case Collation::PREFIX_TAG: 465 case Collation::CONTRACTION_TAG: 466 case Collation::HANGUL_TAG: 467 case Collation::LEAD_SURROGATE_TAG: 468 errorCode = U_UNSUPPORTED_ERROR; 469 return 0; 470 case Collation::FALLBACK_TAG: 471 case Collation::RESERVED_TAG_3: 472 errorCode = U_INTERNAL_PROGRAM_ERROR; 473 return 0; 474 case Collation::LONG_PRIMARY_TAG: 475 return Collation::ceFromLongPrimaryCE32(ce32); 476 case Collation::LONG_SECONDARY_TAG: 477 return Collation::ceFromLongSecondaryCE32(ce32); 478 case Collation::EXPANSION32_TAG: 479 if(Collation::lengthFromCE32(ce32) == 1) { 480 int32_t i = Collation::indexFromCE32(ce32); 481 ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i); 482 break; 483 } else { 484 errorCode = U_UNSUPPORTED_ERROR; 485 return 0; 486 } 487 case Collation::EXPANSION_TAG: { 488 if(Collation::lengthFromCE32(ce32) == 1) { 489 int32_t i = Collation::indexFromCE32(ce32); 490 return fromBase ? base->ces[i] : ce64s.elementAti(i); 491 } else { 492 errorCode = U_UNSUPPORTED_ERROR; 493 return 0; 494 } 495 } 496 case Collation::DIGIT_TAG: 497 // Fetch the non-numeric-collation CE32 and continue. 498 ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32)); 499 break; 500 case Collation::U0000_TAG: 501 U_ASSERT(c == 0); 502 // Fetch the normal ce32 for U+0000 and continue. 503 ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0); 504 break; 505 case Collation::OFFSET_TAG: 506 ce32 = getCE32FromOffsetCE32(fromBase, c, ce32); 507 break; 508 case Collation::IMPLICIT_TAG: 509 return Collation::unassignedCEFromCodePoint(c); 510 } 511 } 512 return Collation::ceFromSimpleCE32(ce32); 513 } 514 515 int32_t 516 CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) { 517 int32_t length = ce64s.size(); 518 for(int32_t i = 0; i < length; ++i) { 519 if(ce == ce64s.elementAti(i)) { return i; } 520 } 521 ce64s.addElement(ce, errorCode); 522 return length; 523 } 524 525 int32_t 526 CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) { 527 int32_t length = ce32s.size(); 528 for(int32_t i = 0; i < length; ++i) { 529 if (ce32 == static_cast<uint32_t>(ce32s.elementAti(i))) { return i; } 530 } 531 ce32s.addElement(static_cast<int32_t>(ce32), errorCode); 532 return length; 533 } 534 535 int32_t 536 CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32, 537 UErrorCode &errorCode) { 538 if(U_FAILURE(errorCode)) { return -1; } 539 U_ASSERT(!context.isEmpty()); 540 int32_t index = conditionalCE32s.size(); 541 if(index > Collation::MAX_INDEX) { 542 errorCode = U_BUFFER_OVERFLOW_ERROR; 543 return -1; 544 } 545 LocalPointer<ConditionalCE32> cond(new ConditionalCE32(context, ce32), errorCode); 546 conditionalCE32s.adoptElement(cond.orphan(), errorCode); 547 if(U_FAILURE(errorCode)) { 548 return -1; 549 } 550 return index; 551 } 552 553 void 554 CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s, 555 const int64_t ces[], int32_t cesLength, 556 UErrorCode &errorCode) { 557 uint32_t ce32 = encodeCEs(ces, cesLength, errorCode); 558 addCE32(prefix, s, ce32, errorCode); 559 } 560 561 void 562 CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s, 563 uint32_t ce32, UErrorCode &errorCode) { 564 if(U_FAILURE(errorCode)) { return; } 565 if(s.isEmpty()) { 566 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 567 return; 568 } 569 if(trie == nullptr || utrie2_isFrozen(trie)) { 570 errorCode = U_INVALID_STATE_ERROR; 571 return; 572 } 573 UChar32 c = s.char32At(0); 574 int32_t cLength = U16_LENGTH(c); 575 uint32_t oldCE32 = utrie2_get32(trie, c); 576 UBool hasContext = !prefix.isEmpty() || s.length() > cLength; 577 578 if (icu4xMode) { 579 if (base && c >= 0x1100 && c < 0x1200) { 580 // Omit jamo tailorings. 581 // TODO(https://github.com/unicode-org/icu4x/issues/1941). 582 } 583 const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode); 584 UnicodeString sInNfd; 585 nfdNormalizer->normalize(s, sInNfd, errorCode); 586 if (s != sInNfd) { 587 // s is not in NFD, so it cannot match in ICU4X, since ICU4X only 588 // does NFD lookups. 589 590 // As of Unicode 16 alpha, the cases that come here are: 591 // 592 // 1. The second character is a special decomposing Tibetan vowel 593 // sign. These are OK to ignore in the precomposed form, since 594 // the decomposed form is added also. 595 // 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI 596 // and other such cases. 597 // For details see the normalization section of 598 // https://www.unicode.org/review/pri497/pri497-background.html 599 // 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical 600 // index feature of ICU4C, which at this time does not have 601 // a counterpart in ICU4X. 602 return; 603 } 604 605 if (!prefix.isEmpty()) { 606 UnicodeString prefixInNfd; 607 nfdNormalizer->normalize(prefix, prefixInNfd, errorCode); 608 if (prefix != prefixInNfd) { 609 errorCode = U_UNSUPPORTED_ERROR; 610 return; 611 } 612 613 int32_t count = prefix.countChar32(); 614 if (count > 2) { 615 // Prefix too long for ICU4X. 616 errorCode = U_UNSUPPORTED_ERROR; 617 return; 618 } 619 UChar32 utf32[4]; 620 int32_t len = prefix.toUTF32(utf32, 4, errorCode); 621 if (len != count) { 622 errorCode = U_INVALID_STATE_ERROR; 623 return; 624 } 625 UChar32 c = utf32[0]; 626 if (u_getCombiningClass(c)) { 627 // Prefix must start with as starter for ICU4X. 628 errorCode = U_UNSUPPORTED_ERROR; 629 return; 630 } 631 // XXX: Korean searchjl has jamo in prefix, so commenting out this 632 // check for now. ICU4X currently ignores non-root jamo tables anyway. 633 // searchjl was added in 634 // https://unicode-org.atlassian.net/browse/CLDR-3560 635 // Contractions were changed to prefixes in 636 // https://unicode-org.atlassian.net/browse/CLDR-6546 637 // 638 // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) { 639 // errorCode = U_UNSUPPORTED_ERROR; 640 // return; 641 // } 642 if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) { 643 // Second character in prefix, if present, must be a kana voicing mark for ICU4X. 644 errorCode = U_UNSUPPORTED_ERROR; 645 return; 646 } 647 } 648 649 if (s.length() > cLength) { 650 // Check that there's no modern Hangul in contractions. 651 for (int32_t i = 0; i < s.length(); ++i) { 652 char16_t c = s.charAt(i); 653 if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) { 654 errorCode = U_UNSUPPORTED_ERROR; 655 return; 656 } 657 } 658 int32_t sCount = s.countChar32(); 659 UChar32 sUtf32[32]; 660 int32_t sLen = s.toUTF32(sUtf32, 32, errorCode); 661 if (sLen != sCount) { 662 // If this error is ever reached, just increase the buffer 663 // size above. 664 errorCode = U_UNSUPPORTED_ERROR; 665 return; 666 } 667 for (int32_t i = 1; i < sLen - 1; ++i) { 668 if (u_getCombiningClass(sUtf32[i]) == 0) { 669 contractionMiddleStarter.add(sUtf32[i]); 670 } 671 } 672 } 673 } 674 675 if(oldCE32 == Collation::FALLBACK_CE32) { 676 // First tailoring for c. 677 // If c has contextual base mappings or if we add a contextual mapping, 678 // then copy the base mappings. 679 // Otherwise we just override the base mapping. 680 uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c)); 681 if(hasContext || Collation::ce32HasContext(baseCE32)) { 682 oldCE32 = copyFromBaseCE32(c, baseCE32, true, errorCode); 683 utrie2_set32(trie, c, oldCE32, &errorCode); 684 if(U_FAILURE(errorCode)) { return; } 685 } 686 } 687 if(!hasContext) { 688 // No prefix, no contraction. 689 if(!isBuilderContextCE32(oldCE32)) { 690 utrie2_set32(trie, c, ce32, &errorCode); 691 } else { 692 ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32); 693 cond->builtCE32 = Collation::NO_CE32; 694 cond->ce32 = ce32; 695 } 696 } else { 697 ConditionalCE32 *cond; 698 if(!isBuilderContextCE32(oldCE32)) { 699 // Replace the simple oldCE32 with a builder context CE32 700 // pointing to a new ConditionalCE32 list head. 701 int32_t index = addConditionalCE32(UnicodeString(static_cast<char16_t>(0)), oldCE32, errorCode); 702 if(U_FAILURE(errorCode)) { return; } 703 uint32_t contextCE32 = makeBuilderContextCE32(index); 704 utrie2_set32(trie, c, contextCE32, &errorCode); 705 contextChars.add(c); 706 cond = getConditionalCE32(index); 707 } else { 708 cond = getConditionalCE32ForCE32(oldCE32); 709 cond->builtCE32 = Collation::NO_CE32; 710 } 711 UnicodeString suffix(s, cLength); 712 UnicodeString context(static_cast<char16_t>(prefix.length())); 713 context.append(prefix).append(suffix); 714 if (icu4xMode && !suffix.isEmpty() && !prefix.isEmpty()) { 715 // ICU4X does not support the combination of prefix and contraction. 716 // This combination is supported by LDML but does not occur in the 717 // root or any tailorings in CLDR as of February 2025. 718 // If support for this case becomes necessary, a practical change 719 // would be allocating a flag on prefix ce32 and setting the 720 // flag on a prefix ce32 if any ce32 that can be found under 721 // the prefix ce32 (either the default or any UCharsTrie value) is 722 // a contraction ce32 or if the prefix ce32 is the utrie2 value 723 // for a character that is a starter that occurs in a middle 724 // (neither first nor last) position in a contraction. 725 errorCode = U_UNSUPPORTED_ERROR; 726 return; 727 } 728 unsafeBackwardSet.addAll(suffix); 729 for(;;) { 730 // invariant: context > cond->context 731 int32_t next = cond->next; 732 if(next < 0) { 733 // Append a new ConditionalCE32 after cond. 734 int32_t index = addConditionalCE32(context, ce32, errorCode); 735 if(U_FAILURE(errorCode)) { return; } 736 cond->next = index; 737 break; 738 } 739 ConditionalCE32 *nextCond = getConditionalCE32(next); 740 int8_t cmp = context.compare(nextCond->context); 741 if(cmp < 0) { 742 // Insert a new ConditionalCE32 between cond and nextCond. 743 int32_t index = addConditionalCE32(context, ce32, errorCode); 744 if(U_FAILURE(errorCode)) { return; } 745 cond->next = index; 746 getConditionalCE32(index)->next = next; 747 break; 748 } else if(cmp == 0) { 749 // Same context as before, overwrite its ce32. 750 nextCond->ce32 = ce32; 751 break; 752 } 753 cond = nextCond; 754 } 755 } 756 modified = true; 757 } 758 759 uint32_t 760 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) { 761 uint32_t p = static_cast<uint32_t>(ce >> 32); 762 uint32_t lower32 = static_cast<uint32_t>(ce); 763 uint32_t t = static_cast<uint32_t>(ce & 0xffff); 764 U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. 765 if((ce & INT64_C(0xffff00ff00ff)) == 0) { 766 // normal form ppppsstt 767 return p | (lower32 >> 16) | (t >> 8); 768 } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) { 769 // long-primary form ppppppC1 770 return Collation::makeLongPrimaryCE32(p); 771 } else if(p == 0 && (t & 0xff) == 0) { 772 // long-secondary form ssssttC2 773 return Collation::makeLongSecondaryCE32(lower32); 774 } 775 return Collation::NO_CE32; 776 } 777 778 uint32_t 779 CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) { 780 // Try to encode one CE as one CE32. 781 uint32_t ce32 = encodeOneCEAsCE32(ce); 782 if(ce32 != Collation::NO_CE32) { return ce32; } 783 int32_t index = addCE(ce, errorCode); 784 if(U_FAILURE(errorCode)) { return 0; } 785 if(index > Collation::MAX_INDEX) { 786 errorCode = U_BUFFER_OVERFLOW_ERROR; 787 return 0; 788 } 789 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1); 790 } 791 792 uint32_t 793 CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, 794 UErrorCode &errorCode) { 795 if(U_FAILURE(errorCode)) { return 0; } 796 if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) { 797 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 798 return 0; 799 } 800 if(trie == nullptr || utrie2_isFrozen(trie)) { 801 errorCode = U_INVALID_STATE_ERROR; 802 return 0; 803 } 804 if(cesLength == 0) { 805 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. 806 // Do this here so that callers need not do it. 807 return encodeOneCEAsCE32(0); 808 } else if(cesLength == 1) { 809 return encodeOneCE(ces[0], errorCode); 810 } else if(cesLength == 2 && !icu4xMode) { 811 // Try to encode two CEs as one CE32. 812 // Turn this off for ICU4X, because without the canonical closure 813 // these are so rare that it doesn't make sense to spend a branch 814 // on checking this tag when using the data. 815 int64_t ce0 = ces[0]; 816 int64_t ce1 = ces[1]; 817 uint32_t p0 = static_cast<uint32_t>(ce0 >> 32); 818 if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE && 819 (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE && 820 p0 != 0) { 821 // Latin mini expansion 822 return 823 p0 | 824 ((static_cast<uint32_t>(ce0) & 0xff00u) << 8) | 825 static_cast<uint32_t>(ce1 >> 16) | 826 Collation::SPECIAL_CE32_LOW_BYTE | 827 Collation::LATIN_EXPANSION_TAG; 828 } 829 } 830 // Try to encode two or more CEs as CE32s. 831 int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH]; 832 for(int32_t i = 0;; ++i) { 833 if(i == cesLength) { 834 return encodeExpansion32(newCE32s, cesLength, errorCode); 835 } 836 uint32_t ce32 = encodeOneCEAsCE32(ces[i]); 837 if(ce32 == Collation::NO_CE32) { break; } 838 newCE32s[i] = static_cast<int32_t>(ce32); 839 } 840 return encodeExpansion(ces, cesLength, errorCode); 841 } 842 843 uint32_t 844 CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) { 845 if(U_FAILURE(errorCode)) { return 0; } 846 // See if this sequence of CEs has already been stored. 847 int64_t first = ces[0]; 848 int32_t ce64sMax = ce64s.size() - length; 849 for(int32_t i = 0; i <= ce64sMax; ++i) { 850 if(first == ce64s.elementAti(i)) { 851 if(i > Collation::MAX_INDEX) { 852 errorCode = U_BUFFER_OVERFLOW_ERROR; 853 return 0; 854 } 855 for(int32_t j = 1;; ++j) { 856 if(j == length) { 857 return Collation::makeCE32FromTagIndexAndLength( 858 Collation::EXPANSION_TAG, i, length); 859 } 860 if(ce64s.elementAti(i + j) != ces[j]) { break; } 861 } 862 } 863 } 864 // Store the new sequence. 865 int32_t i = ce64s.size(); 866 if(i > Collation::MAX_INDEX) { 867 errorCode = U_BUFFER_OVERFLOW_ERROR; 868 return 0; 869 } 870 for(int32_t j = 0; j < length; ++j) { 871 ce64s.addElement(ces[j], errorCode); 872 } 873 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length); 874 } 875 876 uint32_t 877 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length, 878 UErrorCode &errorCode) { 879 if(U_FAILURE(errorCode)) { return 0; } 880 // See if this sequence of CE32s has already been stored. 881 int32_t first = newCE32s[0]; 882 int32_t ce32sMax = ce32s.size() - length; 883 for(int32_t i = 0; i <= ce32sMax; ++i) { 884 if(first == ce32s.elementAti(i)) { 885 if(i > Collation::MAX_INDEX) { 886 errorCode = U_BUFFER_OVERFLOW_ERROR; 887 return 0; 888 } 889 for(int32_t j = 1;; ++j) { 890 if(j == length) { 891 return Collation::makeCE32FromTagIndexAndLength( 892 Collation::EXPANSION32_TAG, i, length); 893 } 894 if(ce32s.elementAti(i + j) != newCE32s[j]) { break; } 895 } 896 } 897 } 898 // Store the new sequence. 899 int32_t i = ce32s.size(); 900 if(i > Collation::MAX_INDEX) { 901 errorCode = U_BUFFER_OVERFLOW_ERROR; 902 return 0; 903 } 904 for(int32_t j = 0; j < length; ++j) { 905 ce32s.addElement(newCE32s[j], errorCode); 906 } 907 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length); 908 } 909 910 uint32_t 911 CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, 912 UErrorCode &errorCode) { 913 if(U_FAILURE(errorCode)) { return 0; } 914 if(!Collation::isSpecialCE32(ce32)) { return ce32; } 915 switch(Collation::tagFromCE32(ce32)) { 916 case Collation::LONG_PRIMARY_TAG: 917 case Collation::LONG_SECONDARY_TAG: 918 case Collation::LATIN_EXPANSION_TAG: 919 // copy as is 920 break; 921 case Collation::EXPANSION32_TAG: { 922 const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32); 923 int32_t length = Collation::lengthFromCE32(ce32); 924 ce32 = encodeExpansion32( 925 reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode); 926 break; 927 } 928 case Collation::EXPANSION_TAG: { 929 const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32); 930 int32_t length = Collation::lengthFromCE32(ce32); 931 ce32 = encodeExpansion(baseCEs, length, errorCode); 932 break; 933 } 934 case Collation::PREFIX_TAG: { 935 // Flatten prefixes and nested suffixes (contractions) 936 // into a linear list of ConditionalCE32. 937 const char16_t *p = base->contexts + Collation::indexFromCE32(ce32); 938 ce32 = CollationData::readCE32(p); // Default if no prefix match. 939 if(!withContext) { 940 return copyFromBaseCE32(c, ce32, false, errorCode); 941 } 942 ConditionalCE32 head; 943 UnicodeString context(static_cast<char16_t>(0)); 944 int32_t index; 945 if(Collation::isContractionCE32(ce32)) { 946 index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); 947 } else { 948 ce32 = copyFromBaseCE32(c, ce32, true, errorCode); 949 head.next = index = addConditionalCE32(context, ce32, errorCode); 950 } 951 if(U_FAILURE(errorCode)) { return 0; } 952 ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far 953 UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); 954 while(prefixes.next(errorCode)) { 955 context = prefixes.getString(); 956 context.reverse(); 957 context.insert(0, static_cast<char16_t>(context.length())); 958 ce32 = static_cast<uint32_t>(prefixes.getValue()); 959 if(Collation::isContractionCE32(ce32)) { 960 index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode); 961 } else { 962 ce32 = copyFromBaseCE32(c, ce32, true, errorCode); 963 cond->next = index = addConditionalCE32(context, ce32, errorCode); 964 } 965 if(U_FAILURE(errorCode)) { return 0; } 966 cond = getConditionalCE32(index); 967 } 968 ce32 = makeBuilderContextCE32(head.next); 969 contextChars.add(c); 970 break; 971 } 972 case Collation::CONTRACTION_TAG: { 973 if(!withContext) { 974 const char16_t *p = base->contexts + Collation::indexFromCE32(ce32); 975 ce32 = CollationData::readCE32(p); // Default if no suffix match. 976 return copyFromBaseCE32(c, ce32, false, errorCode); 977 } 978 ConditionalCE32 head; 979 UnicodeString context(static_cast<char16_t>(0)); 980 copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); 981 ce32 = makeBuilderContextCE32(head.next); 982 contextChars.add(c); 983 break; 984 } 985 case Collation::HANGUL_TAG: 986 errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables. 987 break; 988 case Collation::OFFSET_TAG: 989 ce32 = getCE32FromOffsetCE32(true, c, ce32); 990 break; 991 case Collation::IMPLICIT_TAG: 992 ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode); 993 break; 994 default: 995 UPRV_UNREACHABLE_EXIT; // require ce32 == base->getFinalCE32(ce32) 996 } 997 return ce32; 998 } 999 1000 int32_t 1001 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, 1002 ConditionalCE32 *cond, UErrorCode &errorCode) { 1003 if(U_FAILURE(errorCode)) { return 0; } 1004 const char16_t *p = base->contexts + Collation::indexFromCE32(ce32); 1005 int32_t index; 1006 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 1007 // No match on the single code point. 1008 // We are underneath a prefix, and the default mapping is just 1009 // a fallback to the mappings for a shorter prefix. 1010 U_ASSERT(context.length() > 1); 1011 index = -1; 1012 } else { 1013 ce32 = CollationData::readCE32(p); // Default if no suffix match. 1014 U_ASSERT(!Collation::isContractionCE32(ce32)); 1015 ce32 = copyFromBaseCE32(c, ce32, true, errorCode); 1016 cond->next = index = addConditionalCE32(context, ce32, errorCode); 1017 if(U_FAILURE(errorCode)) { return 0; } 1018 cond = getConditionalCE32(index); 1019 } 1020 1021 int32_t suffixStart = context.length(); 1022 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); 1023 while(suffixes.next(errorCode)) { 1024 context.append(suffixes.getString()); 1025 ce32 = copyFromBaseCE32(c, static_cast<uint32_t>(suffixes.getValue()), true, errorCode); 1026 cond->next = index = addConditionalCE32(context, ce32, errorCode); 1027 if(U_FAILURE(errorCode)) { return 0; } 1028 // No need to update the unsafeBackwardSet because the tailoring set 1029 // is already a copy of the base set. 1030 cond = getConditionalCE32(index); 1031 context.truncate(suffixStart); 1032 } 1033 U_ASSERT(index >= 0); 1034 return index; 1035 } 1036 1037 class CopyHelper { 1038 public: 1039 CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d, 1040 const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode) 1041 : src(s), dest(d), modifier(m), 1042 errorCode(initialErrorCode) {} 1043 1044 UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) { 1045 ce32 = copyCE32(ce32); 1046 utrie2_setRange32(dest.trie, start, end, ce32, true, &errorCode); 1047 if(CollationDataBuilder::isBuilderContextCE32(ce32)) { 1048 dest.contextChars.add(start, end); 1049 } 1050 return U_SUCCESS(errorCode); 1051 } 1052 1053 uint32_t copyCE32(uint32_t ce32) { 1054 if(!Collation::isSpecialCE32(ce32)) { 1055 int64_t ce = modifier.modifyCE32(ce32); 1056 if(ce != Collation::NO_CE) { 1057 ce32 = dest.encodeOneCE(ce, errorCode); 1058 } 1059 } else { 1060 int32_t tag = Collation::tagFromCE32(ce32); 1061 if(tag == Collation::EXPANSION32_TAG) { 1062 const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer()); 1063 srcCE32s += Collation::indexFromCE32(ce32); 1064 int32_t length = Collation::lengthFromCE32(ce32); 1065 // Inspect the source CE32s. Just copy them if none are modified. 1066 // Otherwise copy to modifiedCEs, with modifications. 1067 UBool isModified = false; 1068 for(int32_t i = 0; i < length; ++i) { 1069 ce32 = srcCE32s[i]; 1070 int64_t ce; 1071 if(Collation::isSpecialCE32(ce32) || 1072 (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) { 1073 if(isModified) { 1074 modifiedCEs[i] = Collation::ceFromCE32(ce32); 1075 } 1076 } else { 1077 if(!isModified) { 1078 for(int32_t j = 0; j < i; ++j) { 1079 modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]); 1080 } 1081 isModified = true; 1082 } 1083 modifiedCEs[i] = ce; 1084 } 1085 } 1086 if(isModified) { 1087 ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); 1088 } else { 1089 ce32 = dest.encodeExpansion32( 1090 reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode); 1091 } 1092 } else if(tag == Collation::EXPANSION_TAG) { 1093 const int64_t *srcCEs = src.ce64s.getBuffer(); 1094 srcCEs += Collation::indexFromCE32(ce32); 1095 int32_t length = Collation::lengthFromCE32(ce32); 1096 // Inspect the source CEs. Just copy them if none are modified. 1097 // Otherwise copy to modifiedCEs, with modifications. 1098 UBool isModified = false; 1099 for(int32_t i = 0; i < length; ++i) { 1100 int64_t srcCE = srcCEs[i]; 1101 int64_t ce = modifier.modifyCE(srcCE); 1102 if(ce == Collation::NO_CE) { 1103 if(isModified) { 1104 modifiedCEs[i] = srcCE; 1105 } 1106 } else { 1107 if(!isModified) { 1108 for(int32_t j = 0; j < i; ++j) { 1109 modifiedCEs[j] = srcCEs[j]; 1110 } 1111 isModified = true; 1112 } 1113 modifiedCEs[i] = ce; 1114 } 1115 } 1116 if(isModified) { 1117 ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); 1118 } else { 1119 ce32 = dest.encodeExpansion(srcCEs, length, errorCode); 1120 } 1121 } else if(tag == Collation::BUILDER_DATA_TAG) { 1122 // Copy the list of ConditionalCE32. 1123 ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32); 1124 U_ASSERT(!cond->hasContext()); 1125 int32_t destIndex = dest.addConditionalCE32( 1126 cond->context, copyCE32(cond->ce32), errorCode); 1127 ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex); 1128 while(cond->next >= 0) { 1129 cond = src.getConditionalCE32(cond->next); 1130 ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex); 1131 destIndex = dest.addConditionalCE32( 1132 cond->context, copyCE32(cond->ce32), errorCode); 1133 int32_t suffixStart = cond->prefixLength() + 1; 1134 dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart)); 1135 prevDestCond->next = destIndex; 1136 } 1137 } else { 1138 // Just copy long CEs and Latin mini expansions (and other expected values) as is, 1139 // assuming that the modifier would not modify them. 1140 U_ASSERT(tag == Collation::LONG_PRIMARY_TAG || 1141 tag == Collation::LONG_SECONDARY_TAG || 1142 tag == Collation::LATIN_EXPANSION_TAG || 1143 tag == Collation::HANGUL_TAG); 1144 } 1145 } 1146 return ce32; 1147 } 1148 1149 const CollationDataBuilder &src; 1150 CollationDataBuilder &dest; 1151 const CollationDataBuilder::CEModifier &modifier; 1152 int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH]; 1153 UErrorCode errorCode; 1154 }; 1155 1156 U_CDECL_BEGIN 1157 1158 static UBool U_CALLCONV 1159 enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1160 return 1161 value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 || 1162 ((CopyHelper *)context)->copyRangeCE32(start, end, value); 1163 } 1164 1165 U_CDECL_END 1166 1167 void 1168 CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, 1169 UErrorCode &errorCode) { 1170 if(U_FAILURE(errorCode)) { return; } 1171 if(trie == nullptr || utrie2_isFrozen(trie)) { 1172 errorCode = U_INVALID_STATE_ERROR; 1173 return; 1174 } 1175 CopyHelper helper(src, *this, modifier, errorCode); 1176 utrie2_enum(src.trie, nullptr, enumRangeForCopy, &helper); 1177 errorCode = helper.errorCode; 1178 // Update the contextChars and the unsafeBackwardSet while copying, 1179 // in case a character had conditional mappings in the source builder 1180 // and they were removed later. 1181 modified |= src.modified; 1182 } 1183 1184 void 1185 CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) { 1186 if(U_FAILURE(errorCode) || set.isEmpty()) { return; } 1187 UnicodeSetIterator iter(set); 1188 while(iter.next() && !iter.isString()) { 1189 UChar32 c = iter.getCodepoint(); 1190 uint32_t ce32 = utrie2_get32(trie, c); 1191 if(ce32 == Collation::FALLBACK_CE32) { 1192 ce32 = base->getFinalCE32(base->getCE32(c)); 1193 ce32 = copyFromBaseCE32(c, ce32, true, errorCode); 1194 utrie2_set32(trie, c, ce32, &errorCode); 1195 } 1196 } 1197 modified = true; 1198 } 1199 1200 void 1201 CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) { 1202 if(U_FAILURE(errorCode) || set.isEmpty()) { return; } 1203 UnicodeSetIterator iter(set); 1204 while(iter.next() && !iter.isString()) { 1205 UChar32 c = iter.getCodepoint(); 1206 uint32_t ce32 = utrie2_get32(trie, c); 1207 if(ce32 == Collation::FALLBACK_CE32) { 1208 ce32 = base->getFinalCE32(base->getCE32(c)); 1209 if(Collation::ce32HasContext(ce32)) { 1210 ce32 = copyFromBaseCE32(c, ce32, false /* without context */, errorCode); 1211 utrie2_set32(trie, c, ce32, &errorCode); 1212 } 1213 } else if(isBuilderContextCE32(ce32)) { 1214 ce32 = getConditionalCE32ForCE32(ce32)->ce32; 1215 // Simply abandon the list of ConditionalCE32. 1216 // The caller will copy this builder in the end, 1217 // eliminating unreachable data. 1218 utrie2_set32(trie, c, ce32, &errorCode); 1219 contextChars.remove(c); 1220 } 1221 } 1222 modified = true; 1223 } 1224 1225 UBool 1226 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) { 1227 if(U_FAILURE(errorCode)) { return false; } 1228 UBool anyJamoAssigned = base == nullptr; // always set jamoCE32s in the base data 1229 UBool needToCopyFromBase = false; 1230 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 1231 UChar32 jamo = jamoCpFromIndex(j); 1232 UBool fromBase = false; 1233 uint32_t ce32 = utrie2_get32(trie, jamo); 1234 anyJamoAssigned |= Collation::isAssignedCE32(ce32); 1235 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. 1236 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) 1237 if(ce32 == Collation::FALLBACK_CE32) { 1238 fromBase = true; 1239 ce32 = base->getCE32(jamo); 1240 } 1241 if(Collation::isSpecialCE32(ce32)) { 1242 switch(Collation::tagFromCE32(ce32)) { 1243 case Collation::LONG_PRIMARY_TAG: 1244 case Collation::LONG_SECONDARY_TAG: 1245 case Collation::LATIN_EXPANSION_TAG: 1246 // Copy the ce32 as-is. 1247 break; 1248 case Collation::EXPANSION32_TAG: 1249 case Collation::EXPANSION_TAG: 1250 case Collation::PREFIX_TAG: 1251 case Collation::CONTRACTION_TAG: 1252 if(fromBase) { 1253 // Defer copying until we know if anyJamoAssigned. 1254 ce32 = Collation::FALLBACK_CE32; 1255 needToCopyFromBase = true; 1256 } 1257 break; 1258 case Collation::IMPLICIT_TAG: 1259 // An unassigned Jamo should only occur in tests with incomplete bases. 1260 U_ASSERT(fromBase); 1261 ce32 = Collation::FALLBACK_CE32; 1262 needToCopyFromBase = true; 1263 break; 1264 case Collation::OFFSET_TAG: 1265 ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); 1266 break; 1267 case Collation::FALLBACK_TAG: 1268 case Collation::RESERVED_TAG_3: 1269 case Collation::BUILDER_DATA_TAG: 1270 case Collation::DIGIT_TAG: 1271 case Collation::U0000_TAG: 1272 case Collation::HANGUL_TAG: 1273 case Collation::LEAD_SURROGATE_TAG: 1274 errorCode = U_INTERNAL_PROGRAM_ERROR; 1275 return false; 1276 } 1277 } 1278 jamoCE32s[j] = ce32; 1279 } 1280 if(anyJamoAssigned && needToCopyFromBase) { 1281 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { 1282 if(jamoCE32s[j] == Collation::FALLBACK_CE32) { 1283 UChar32 jamo = jamoCpFromIndex(j); 1284 jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo), 1285 /*withContext=*/ true, errorCode); 1286 } 1287 } 1288 } 1289 return anyJamoAssigned && U_SUCCESS(errorCode); 1290 } 1291 1292 void 1293 CollationDataBuilder::setDigitTags(UErrorCode &errorCode) { 1294 UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode); 1295 if(U_FAILURE(errorCode)) { return; } 1296 UnicodeSetIterator iter(digits); 1297 while(iter.next()) { 1298 U_ASSERT(!iter.isString()); 1299 UChar32 c = iter.getCodepoint(); 1300 uint32_t ce32 = utrie2_get32(trie, c); 1301 if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) { 1302 int32_t index = addCE32(ce32, errorCode); 1303 if(U_FAILURE(errorCode)) { return; } 1304 if(index > Collation::MAX_INDEX) { 1305 errorCode = U_BUFFER_OVERFLOW_ERROR; 1306 return; 1307 } 1308 ce32 = Collation::makeCE32FromTagIndexAndLength( 1309 Collation::DIGIT_TAG, index, u_charDigitValue(c)); 1310 utrie2_set32(trie, c, ce32, &errorCode); 1311 } 1312 } 1313 } 1314 1315 U_CDECL_BEGIN 1316 1317 static UBool U_CALLCONV 1318 enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1319 int32_t *pValue = (int32_t *)context; 1320 if(value == Collation::UNASSIGNED_CE32) { 1321 value = Collation::LEAD_ALL_UNASSIGNED; 1322 } else if(value == Collation::FALLBACK_CE32) { 1323 value = Collation::LEAD_ALL_FALLBACK; 1324 } else { 1325 *pValue = Collation::LEAD_MIXED; 1326 return false; 1327 } 1328 if(*pValue < 0) { 1329 *pValue = (int32_t)value; 1330 } else if(*pValue != (int32_t)value) { 1331 *pValue = Collation::LEAD_MIXED; 1332 return false; 1333 } 1334 return true; 1335 } 1336 1337 U_CDECL_END 1338 1339 void 1340 CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) { 1341 for(char16_t lead = 0xd800; lead < 0xdc00; ++lead) { 1342 int32_t value = -1; 1343 utrie2_enumForLeadSurrogate(trie, lead, nullptr, enumRangeLeadValue, &value); 1344 utrie2_set32ForLeadSurrogateCodeUnit( 1345 trie, lead, 1346 Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | static_cast<uint32_t>(value), 1347 &errorCode); 1348 } 1349 } 1350 1351 void 1352 CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) { 1353 buildMappings(data, errorCode); 1354 if(base != nullptr) { 1355 data.numericPrimary = base->numericPrimary; 1356 data.compressibleBytes = base->compressibleBytes; 1357 data.numScripts = base->numScripts; 1358 data.scriptsIndex = base->scriptsIndex; 1359 data.scriptStarts = base->scriptStarts; 1360 data.scriptStartsLength = base->scriptStartsLength; 1361 } 1362 buildFastLatinTable(data, errorCode); 1363 } 1364 1365 void 1366 CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) { 1367 if(U_FAILURE(errorCode)) { return; } 1368 if(trie == nullptr || utrie2_isFrozen(trie)) { 1369 errorCode = U_INVALID_STATE_ERROR; 1370 return; 1371 } 1372 1373 buildContexts(errorCode); 1374 1375 uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; 1376 int32_t jamoIndex = -1; 1377 if(getJamoCE32s(jamoCE32s, errorCode)) { 1378 jamoIndex = ce32s.size(); 1379 for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) { 1380 ce32s.addElement(static_cast<int32_t>(jamoCE32s[i]), errorCode); 1381 } 1382 // Small optimization: Use a bit in the Hangul ce32 1383 // to indicate that none of the Jamo CE32s are isSpecialCE32() 1384 // (as it should be in the root collator). 1385 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. 1386 // In order to still have good trie compression and keep this code simple, 1387 // we only set this flag if a whole block of 588 Hangul syllables starting with 1388 // a common leading consonant (Jamo L) has this property. 1389 UBool isAnyJamoVTSpecial = false; 1390 for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { 1391 if(Collation::isSpecialCE32(jamoCE32s[i])) { 1392 isAnyJamoVTSpecial = true; 1393 break; 1394 } 1395 } 1396 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); 1397 UChar32 c = Hangul::HANGUL_BASE; 1398 for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L 1399 uint32_t ce32 = hangulCE32; 1400 if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) { 1401 ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO; 1402 } 1403 UChar32 limit = c + Hangul::JAMO_VT_COUNT; 1404 utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode); 1405 c = limit; 1406 } 1407 } else { 1408 // Copy the Hangul CE32s from the base in blocks per Jamo L, 1409 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. 1410 for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) { 1411 uint32_t ce32 = base->getCE32(c); 1412 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)); 1413 UChar32 limit = c + Hangul::JAMO_VT_COUNT; 1414 utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode); 1415 c = limit; 1416 } 1417 } 1418 1419 setDigitTags(errorCode); 1420 setLeadSurrogates(errorCode); 1421 1422 if (icu4xMode) { 1423 // Make sure that starters that occur is the middle of a 1424 // contraction have contraction ce32 with the 1425 // `CONTRACT_HAS_STARTER` flag set so that starters that 1426 // can occur in a non-final position in a contraction can 1427 // be easily recognized from having a contraction ce32 1428 // that has the `CONTRACT_HAS_STARTER` flag set. 1429 1430 UCharsTrieBuilder contractionBuilder(errorCode); 1431 // Intentionally unpaired low surrogate to make it never 1432 // match well-formed UTF-16 which ICU4X feeds to the 1433 // matcher. 1434 UnicodeString placeholder(0xDC00); 1435 1436 for (UChar32 c : contractionMiddleStarter.codePoints()) { 1437 uint32_t ce32 = utrie2_get32(trie, c); 1438 UBool fromBase = false; 1439 if(ce32 == Collation::FALLBACK_CE32) { 1440 fromBase = true; 1441 ce32 = base->getCE32(c); 1442 } 1443 if (!(Collation::hasCE32Tag(ce32, Collation::CONTRACTION_TAG) && (ce32 & Collation::CONTRACT_HAS_STARTER))) { 1444 if (fromBase) { 1445 // This case does not actually happen as of February 2025. 1446 ce32 = copyFromBaseCE32(c, ce32, true, errorCode); 1447 } 1448 if (Collation::hasCE32Tag(ce32, Collation::CONTRACTION_TAG)) { 1449 // This middle starter is also the first character of another 1450 // contraction, but that contraction does not have the 1451 // CONTRACT_HAS_STARTER flag. Let's add the flag to 1452 // mark this at the expense of pessimizing the matching 1453 // of this contraction. 1454 // As of February 2025, this case does not actually occur 1455 // in CLDR. 1456 ce32 |= Collation::CONTRACT_HAS_STARTER; 1457 } else { 1458 // This middle starter is not also the first character 1459 // in another contraction. 1460 1461 // The UCharsTrie needs to contain some placeholder 1462 // because it cannot be empty. We build a trie 1463 // that never actually matches anything that ICU4X can try to 1464 // match, since ICU4X always passes well-formed UTF-16 to the 1465 // matcher and we put an unpaired low surrogate into the trie. 1466 // This pessimizes the character to CE mapping of the `c`, 1467 // since useless trie matching will be attempted but as of 1468 // February 2025, only two relatively rare characters are affected. 1469 contractionBuilder.clear(); 1470 contractionBuilder.add(placeholder, static_cast<int32_t>(ce32), errorCode); 1471 1472 int32_t index = addContextTrie(ce32, contractionBuilder, errorCode); 1473 if(U_FAILURE(errorCode)) { return; } 1474 if(index > Collation::MAX_INDEX) { 1475 errorCode = U_BUFFER_OVERFLOW_ERROR; 1476 return; 1477 } 1478 // Set CONTRACT_HAS_STARTER to make identical prefix matching able to catch this. 1479 ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | Collation::CONTRACT_HAS_STARTER; 1480 } 1481 utrie2_set32(trie, c, ce32, &errorCode); 1482 } 1483 } 1484 } else { 1485 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. 1486 ce32s.setElementAt(static_cast<int32_t>(utrie2_get32(trie, 0)), 0); 1487 utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); 1488 } 1489 1490 utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); 1491 if(U_FAILURE(errorCode)) { return; } 1492 1493 // Mark each lead surrogate as "unsafe" 1494 // if any of its 1024 associated supplementary code points is "unsafe". 1495 UChar32 c = 0x10000; 1496 for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 1497 if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { 1498 unsafeBackwardSet.add(lead); 1499 } 1500 } 1501 unsafeBackwardSet.freeze(); 1502 1503 data.trie = trie; 1504 data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer()); 1505 data.ces = ce64s.getBuffer(); 1506 data.contexts = contexts.getBuffer(); 1507 1508 data.ce32sLength = ce32s.size(); 1509 data.cesLength = ce64s.size(); 1510 data.contextsLength = contexts.length(); 1511 1512 data.base = base; 1513 if(jamoIndex >= 0) { 1514 data.jamoCE32s = data.ce32s + jamoIndex; 1515 } else { 1516 data.jamoCE32s = base->jamoCE32s; 1517 } 1518 data.unsafeBackwardSet = &unsafeBackwardSet; 1519 } 1520 1521 void 1522 CollationDataBuilder::clearContexts() { 1523 contexts.remove(); 1524 // Incrementing the contexts build "era" invalidates all of the builtCE32 1525 // from before this clearContexts() call. 1526 // Simpler than finding and resetting all of those fields. 1527 ++contextsEra; 1528 } 1529 1530 void 1531 CollationDataBuilder::buildContexts(UErrorCode &errorCode) { 1532 if(U_FAILURE(errorCode)) { return; } 1533 // Ignore abandoned lists and the cached builtCE32, 1534 // and build all contexts from scratch. 1535 clearContexts(); 1536 UnicodeSetIterator iter(contextChars); 1537 while(U_SUCCESS(errorCode) && iter.next()) { 1538 U_ASSERT(!iter.isString()); 1539 UChar32 c = iter.getCodepoint(); 1540 uint32_t ce32 = utrie2_get32(trie, c); 1541 if(!isBuilderContextCE32(ce32)) { 1542 // Impossible: No context data for c in contextChars. 1543 errorCode = U_INTERNAL_PROGRAM_ERROR; 1544 return; 1545 } 1546 ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32); 1547 ce32 = buildContext(cond, errorCode); 1548 utrie2_set32(trie, c, ce32, &errorCode); 1549 } 1550 } 1551 1552 uint32_t 1553 CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) { 1554 if(U_FAILURE(errorCode)) { return 0; } 1555 // The list head must have no context. 1556 U_ASSERT(!head->hasContext()); 1557 // The list head must be followed by one or more nodes that all do have context. 1558 U_ASSERT(head->next >= 0); 1559 UCharsTrieBuilder prefixBuilder(errorCode); 1560 UCharsTrieBuilder contractionBuilder(errorCode); 1561 // This outer loop goes from each prefix to the next. 1562 // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond). 1563 // If there are multiple suffixes for the same prefix, 1564 // then an inner loop builds a contraction trie for them. 1565 for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) { 1566 if(U_FAILURE(errorCode)) { return 0; } // early out for memory allocation errors 1567 // After the list head, the prefix or suffix can be empty, but not both. 1568 U_ASSERT(cond == head || cond->hasContext()); 1569 int32_t prefixLength = cond->prefixLength(); 1570 UnicodeString prefix(cond->context, 0, prefixLength + 1); 1571 // Collect all contraction suffixes for one prefix. 1572 ConditionalCE32 *firstCond = cond; 1573 ConditionalCE32 *lastCond; 1574 do { 1575 lastCond = cond; 1576 // Clear the defaultCE32 fields as we go. 1577 // They are left over from building a previous version of this list of contexts. 1578 // 1579 // One of the code paths below may copy a preceding defaultCE32 1580 // into its emptySuffixCE32. 1581 // If a new suffix has been inserted before what used to be 1582 // the firstCond for its prefix, then that previous firstCond could still 1583 // contain an outdated defaultCE32 from an earlier buildContext() and 1584 // result in an incorrect emptySuffixCE32. 1585 // So we reset all defaultCE32 before reading and setting new values. 1586 cond->defaultCE32 = Collation::NO_CE32; 1587 } while(cond->next >= 0 && 1588 (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)); 1589 uint32_t ce32; 1590 int32_t suffixStart = prefixLength + 1; // == prefix.length() 1591 if(lastCond->context.length() == suffixStart) { 1592 // One prefix without contraction suffix. 1593 U_ASSERT(firstCond == lastCond); 1594 ce32 = lastCond->ce32; 1595 cond = lastCond; 1596 } else { 1597 // Build the contractions trie. 1598 contractionBuilder.clear(); 1599 // Entry for an empty suffix, to be stored before the trie. 1600 uint32_t emptySuffixCE32 = 0; 1601 uint32_t flags = 0; 1602 if(firstCond->context.length() == suffixStart) { 1603 // There is a mapping for the prefix and the single character c. (p|c) 1604 // If no other suffix matches, then we return this value. 1605 emptySuffixCE32 = firstCond->ce32; 1606 cond = getConditionalCE32(firstCond->next); 1607 } else { 1608 // There is no mapping for the prefix and just the single character. 1609 // (There is no p|c, only p|cd, p|ce etc.) 1610 flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH; 1611 // When the prefix matches but none of the prefix-specific suffixes, 1612 // then we fall back to the mappings with the next-longest prefix, 1613 // and ultimately to mappings with no prefix. 1614 // Each fallback might be another set of contractions. 1615 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, 1616 // then in text "pch" we find the ch contraction. 1617 for(cond = head;; cond = getConditionalCE32(cond->next)) { 1618 int32_t length = cond->prefixLength(); 1619 if(length == prefixLength) { break; } 1620 if(cond->defaultCE32 != Collation::NO_CE32 && 1621 (length==0 || prefix.endsWith(cond->context, 1, length))) { 1622 emptySuffixCE32 = cond->defaultCE32; 1623 } 1624 } 1625 cond = firstCond; 1626 } 1627 // Optimization: Set a flag when 1628 // the first character of every contraction suffix has lccc!=0. 1629 // Short-circuits contraction matching when a normal letter follows. 1630 flags |= Collation::CONTRACT_NEXT_CCC; 1631 // Add all of the non-empty suffixes into the contraction trie. 1632 for(;;) { 1633 UnicodeString suffix(cond->context, suffixStart); 1634 uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0)); 1635 if(fcd16 <= 0xff) { 1636 flags &= ~Collation::CONTRACT_NEXT_CCC; 1637 } 1638 fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1)); 1639 if(fcd16 > 0xff) { 1640 // The last suffix character has lccc!=0, allowing for discontiguous contractions. 1641 flags |= Collation::CONTRACT_TRAILING_CCC; 1642 } 1643 if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) { 1644 for (int32_t i = 0; i < suffix.length();) { 1645 UChar32 c = suffix.char32At(i); 1646 if (!u_getCombiningClass(c)) { 1647 flags |= Collation::CONTRACT_HAS_STARTER; 1648 break; 1649 } 1650 if (c > 0xFFFF) { 1651 i += 2; 1652 } else { 1653 ++i; 1654 } 1655 } 1656 } 1657 contractionBuilder.add(suffix, static_cast<int32_t>(cond->ce32), errorCode); 1658 if(cond == lastCond) { break; } 1659 cond = getConditionalCE32(cond->next); 1660 } 1661 int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode); 1662 if(U_FAILURE(errorCode)) { return 0; } 1663 if(index > Collation::MAX_INDEX) { 1664 errorCode = U_BUFFER_OVERFLOW_ERROR; 1665 return 0; 1666 } 1667 ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags; 1668 } 1669 U_ASSERT(cond == lastCond); 1670 firstCond->defaultCE32 = ce32; 1671 if(prefixLength == 0) { 1672 if(cond->next < 0) { 1673 // No non-empty prefixes, only contractions. 1674 return ce32; 1675 } 1676 } else { 1677 prefix.remove(0, 1); // Remove the length unit. 1678 prefix.reverse(); 1679 prefixBuilder.add(prefix, static_cast<int32_t>(ce32), errorCode); 1680 if(cond->next < 0) { break; } 1681 } 1682 } 1683 U_ASSERT(head->defaultCE32 != Collation::NO_CE32); 1684 int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode); 1685 if(U_FAILURE(errorCode)) { return 0; } 1686 if(index > Collation::MAX_INDEX) { 1687 errorCode = U_BUFFER_OVERFLOW_ERROR; 1688 return 0; 1689 } 1690 return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index); 1691 } 1692 1693 int32_t 1694 CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, 1695 UErrorCode &errorCode) { 1696 UnicodeString context; 1697 context.append(static_cast<char16_t>(defaultCE32 >> 16)).append(static_cast<char16_t>(defaultCE32)); 1698 UnicodeString trieString; 1699 context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode)); 1700 if(U_FAILURE(errorCode)) { return -1; } 1701 int32_t index = contexts.indexOf(context); 1702 if(index < 0) { 1703 index = contexts.length(); 1704 contexts.append(context); 1705 } 1706 return index; 1707 } 1708 1709 void 1710 CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) { 1711 if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; } 1712 1713 delete fastLatinBuilder; 1714 fastLatinBuilder = new CollationFastLatinBuilder(errorCode); 1715 if(fastLatinBuilder == nullptr) { 1716 errorCode = U_MEMORY_ALLOCATION_ERROR; 1717 return; 1718 } 1719 if(fastLatinBuilder->forData(data, errorCode)) { 1720 const uint16_t *table = fastLatinBuilder->getTable(); 1721 int32_t length = fastLatinBuilder->lengthOfTable(); 1722 if(base != nullptr && length == base->fastLatinTableLength && 1723 uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) { 1724 // Same fast Latin table as in the base, use that one instead. 1725 delete fastLatinBuilder; 1726 fastLatinBuilder = nullptr; 1727 table = base->fastLatinTable; 1728 } 1729 data.fastLatinTable = table; 1730 data.fastLatinTableLength = length; 1731 } else { 1732 delete fastLatinBuilder; 1733 fastLatinBuilder = nullptr; 1734 } 1735 } 1736 1737 int32_t 1738 CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) { 1739 return getCEs(s, 0, ces, cesLength); 1740 } 1741 1742 int32_t 1743 CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s, 1744 int64_t ces[], int32_t cesLength) { 1745 int32_t prefixLength = prefix.length(); 1746 if(prefixLength == 0) { 1747 return getCEs(s, 0, ces, cesLength); 1748 } else { 1749 return getCEs(prefix + s, prefixLength, ces, cesLength); 1750 } 1751 } 1752 1753 int32_t 1754 CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start, 1755 int64_t ces[], int32_t cesLength) { 1756 if(collIter == nullptr) { 1757 collIter = new DataBuilderCollationIterator(*this); 1758 if(collIter == nullptr) { return 0; } 1759 } 1760 return collIter->fetchCEs(s, start, ces, cesLength); 1761 } 1762 1763 U_NAMESPACE_END 1764 1765 #endif // !UCONFIG_NO_COLLATION