rulebasedcollator.cpp (61988B)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* rulebasedcollator.cpp
*
* (replaced the former tblcoll.cpp)
*
* created on: 2012feb14 with new and old collation code
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/coll.h"
#include "unicode/coleitr.h"
#include "unicode/localpointer.h"
#include "unicode/locid.h"
#include "unicode/sortkey.h"
#include "unicode/tblcoll.h"
#include "unicode/ucol.h"
#include "unicode/uiter.h"
#include "unicode/uloc.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/utf8.h"
#include "unicode/uversion.h"
#include "bocsu.h"
#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationcompare.h"
#include "collationdata.h"
#include "collationdatareader.h"
#include "collationfastlatin.h"
#include "collationiterator.h"
#include "collationkeys.h"
#include "collationroot.h"
#include "collationsets.h"
#include "collationsettings.h"
#include "collationtailoring.h"
#include "cstring.h"
#include "uassert.h"
#include "ucol_imp.h"
#include "uhash.h"
#include "uitercollationiterator.h"
#include "ulocimp.h"
#include "ustr_imp.h"
#include "utf16collationiterator.h"
#include "utf8collationiterator.h"
#include "uvectr64.h"

U_NAMESPACE_BEGIN

namespace {

/**
 * Sort key byte sink that writes into a caller-supplied buffer of fixed capacity.
 * It never grows: Resize() always returns false, and AppendBeyondCapacity()
 * copies only as many bytes as still fit.
 */
class FixedSortKeyByteSink : public SortKeyByteSink {
public:
    FixedSortKeyByteSink(char *dest, int32_t destCapacity)
            : SortKeyByteSink(dest, destCapacity) {}
    virtual ~FixedSortKeyByteSink();

private:
    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
    virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
};

FixedSortKeyByteSink::~FixedSortKeyByteSink() {}

/**
 * Copies the leading part of bytes that still fits between length and capacity_.
 * The remainder of the input is not stored.
 */
void
FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
    // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
    // Fill the buffer completely.
    int32_t available = capacity_ - length;
    if (available > 0) {
        uprv_memcpy(buffer_ + length, bytes, available);
    }
}

/** The fixed buffer cannot be enlarged; always reports failure. */
UBool
FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
    return false;
}

}  // namespace

/**
 * Sort key byte sink that writes into a CollationKey's byte array
 * and asks the key to reallocate when more room is needed.
 */
// Not in an anonymous namespace, so that it can be a friend of CollationKey.
class CollationKeyByteSink : public SortKeyByteSink {
public:
    CollationKeyByteSink(CollationKey &key)
            : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
              key_(key) {}
    virtual ~CollationKeyByteSink();

private:
    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
    virtual UBool Resize(int32_t appendCapacity, int32_t length) override;

    CollationKey &key_;
};

CollationKeyByteSink::~CollationKeyByteSink() {}

/** Grows the key's buffer via Resize() and, if that succeeded, appends all n bytes at length. */
void
CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
    // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
    if (Resize(n, length)) {
        uprv_memcpy(buffer_ + length, bytes, n);
    }
}

/**
 * Reallocates the key's buffer.
 * The new capacity is the largest of: twice the current capacity,
 * length + 2 * appendCapacity, and 200.
 * @return true if the buffer was reallocated; false if there is no buffer
 *         or if key_.reallocate() returned nullptr (in which case SetNotOk() is called)
 */
UBool
CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
    if (buffer_ == nullptr) {
        return false;  // allocation failed before already
    }
    int32_t newCapacity = 2 * capacity_;
    int32_t altCapacity = length + 2 * appendCapacity;
    if (newCapacity < altCapacity) {
        newCapacity = altCapacity;
    }
    if (newCapacity < 200) {
        newCapacity = 200;
    }
    uint8_t *newBuffer = key_.reallocate(newCapacity, length);
    if (newBuffer == nullptr) {
        SetNotOk();
        return false;
    }
    buffer_ = reinterpret_cast<char *>(newBuffer);
    capacity_ = newCapacity;
    return true;
}

/**
 * Copy constructor: copies all pointers and flags from other
 * and adds a reference to the shared settings and cache entry.
 */
RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
        : Collator(other),
          data(other.data),
          settings(other.settings),
          tailoring(other.tailoring),
          cacheEntry(other.cacheEntry),
          validLocale(other.validLocale),
          explicitlySetAttributes(other.explicitlySetAttributes),
          actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
    settings->addRef();
    cacheEntry->addRef();
}

/**
 * Constructs a collator from binary collation data.
 * @param bin the binary data; must not be nullptr
 * @param length the length of bin; must not be 0
 * @param base the base collator; must not be nullptr, and its tailoring must be the root tailoring
 * @param errorCode set to U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *        U_UNSUPPORTED_ERROR if base is not the root collator,
 *        U_MEMORY_ALLOCATION_ERROR if the new tailoring cannot be created,
 *        or to whatever CollationDataReader::read() reports
 */
RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
                                     const RuleBasedCollator *base, UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    if(U_FAILURE(errorCode)) { return; }
    if(bin == nullptr || length == 0 || base == nullptr) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const CollationTailoring *root = CollationRoot::getRoot(errorCode);
    if(U_FAILURE(errorCode)) { return; }
    // Only the root collator is accepted as the base.
    if(base->tailoring != root) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
    if(t.isNull() || t->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    // The binary data does not come with a locale.
    t->actualLocale.setToBogus();
    adoptTailoring(t.orphan(), errorCode);
}

/**
 * Constructs a collator that uses the tailoring and valid locale of a cache entry,
 * adding a reference to the entry and to the tailoring's settings.
 */
RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
        : data(entry->tailoring->data),
          settings(entry->tailoring->settings),
          tailoring(entry->tailoring),
          cacheEntry(entry),
          validLocale(entry->validLocale),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    settings->addRef();
    cacheEntry->addRef();
}

/** Releases this collator's hold on the shared settings and cache entry via SharedObject::clearPtr(). */
RuleBasedCollator::~RuleBasedCollator() {
    SharedObject::clearPtr(settings);
    SharedObject::clearPtr(cacheEntry);
}

/**
 * Takes over the tailoring t and initializes this (so far empty) collator from it.
 * A new CollationCacheEntry is created for t.
 * If errorCode is already a failure, or the cache entry cannot be allocated,
 * then t->deleteIfZeroRefCount() is called and this object is left unchanged.
 */
void
RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        t->deleteIfZeroRefCount();
        return;
    }
    U_ASSERT(settings == nullptr && data == nullptr && tailoring == nullptr && cacheEntry == nullptr);
    cacheEntry = new CollationCacheEntry(t->actualLocale, t);
    if(cacheEntry == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        t->deleteIfZeroRefCount();
        return;
    }
    data = t->data;
    settings = t->settings;
    settings->addRef();
    tailoring = t;
    cacheEntry->addRef();
    validLocale = t->actualLocale;
    actualLocaleIsSameAsValid = false;
}

/** Returns a heap-allocated copy of this collator (nullptr if the allocation fails). */
RuleBasedCollator *
RuleBasedCollator::clone() const {
    return new RuleBasedCollator(*this);
}

/**
 * Assignment: shares other's settings and cache entry (via SharedObject::copyPtr())
 * and copies its tailoring pointer, locale and attribute flags.
 */
RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
    if(this == &other) { return *this; }
    SharedObject::copyPtr(other.settings, settings);
    tailoring = other.tailoring;
    SharedObject::copyPtr(other.cacheEntry, cacheEntry);
    data = tailoring->data;
    validLocale = other.validLocale;
    explicitlySetAttributes = other.explicitlySetAttributes;
    actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
    return *this;
}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)

/**
 * Equality test.
 * Two collators are equal if their settings are equal and they either share the same data,
 * or have equal rule strings, or (as an approximation, see below) tailor the same set.
 */
bool
RuleBasedCollator::operator==(const Collator& other) const {
    if(this == &other) { return true; }
    if(!Collator::operator==(other)) { return false; }
    // NOTE(review): the downcast presumably relies on Collator::operator==()
    // having verified the dynamic type -- confirm.
    const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
    if(*settings != *o.settings) { return false; }
    if(data == o.data) { return true; }
    // data->base == nullptr identifies the root collator, see hashCode().
    UBool thisIsRoot = data->base == nullptr;
    UBool otherIsRoot = o.data->base == nullptr;
    U_ASSERT(!thisIsRoot || !otherIsRoot);  // otherwise their data pointers should be ==
    if(thisIsRoot != otherIsRoot) { return false; }
    if((thisIsRoot || !tailoring->rules.isEmpty()) &&
            (otherIsRoot || !o.tailoring->rules.isEmpty())) {
        // Shortcut: If both collators have valid rule strings, then compare those.
        if(tailoring->rules == o.tailoring->rules) { return true; }
    }
    // Different rule strings can result in the same or equivalent tailoring.
    // The rule strings are optional in ICU resource bundles, although included by default.
    // cloneBinary() drops the rule string.
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
    LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
    if(U_FAILURE(errorCode)) { return false; }
    if(*thisTailored != *otherTailored) { return false; }
    // For completeness, we should compare all of the mappings;
    // or we should create a list of strings, sort it with one collator,
    // and check if both collators compare adjacent strings the same
    // (order & strength, down to quaternary); or similar.
    // Testing equality of collators seems unusual.
    return true;
}

/**
 * Hash code: the settings' hash code, and for a tailored (non-root) collator
 * additionally XORed with the CE32 values of the tailored code points.
 * Returns 0 if the tailored set cannot be computed.
 */
int32_t
RuleBasedCollator::hashCode() const {
    int32_t h = settings->hashCode();
    if(data->base == nullptr) { return h; }  // root collator
    // Do not rely on the rule string, see comments in operator==().
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
    if(U_FAILURE(errorCode)) { return 0; }
    UnicodeSetIterator iter(*set);
    // The loop stops at the first element that is a string rather than a code point.
    while(iter.next() && !iter.isString()) {
        h ^= data->getCE32(iter.getCodepoint());
    }
    return h;
}

/**
 * Records the valid locale, and whether the actual locale is the one stored
 * in the tailoring or the same as the valid locale.
 * The requested locale is ignored.
 */
void
RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
                              const Locale &actual) {
    if(actual == tailoring->actualLocale) {
        actualLocaleIsSameAsValid = false;
    } else {
        U_ASSERT(actual == valid);
        actualLocaleIsSameAsValid = true;
    }
    // Do not modify tailoring.actualLocale:
    // We cannot be sure that that would be thread-safe.
    validLocale = valid;
    (void)requested;  // Ignore, see also ticket #10477.
}

/**
 * Returns the actual or valid locale.
 * Any other type (including ULOC_REQUESTED_LOCALE) sets U_ILLEGAL_ARGUMENT_ERROR
 * and returns the root locale, as does an incoming failure code.
 */
Locale
RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
    if(U_FAILURE(errorCode)) {
        return Locale::getRoot();
    }
    switch(type) {
    case ULOC_ACTUAL_LOCALE:
        return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
    case ULOC_VALID_LOCALE:
        return validLocale;
    case ULOC_REQUESTED_LOCALE:
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return Locale::getRoot();
    }
}

/**
 * Like getLocale() but returns the locale ID string.
 * Returns nullptr on error or if the selected locale is bogus,
 * and "root" if the locale name is empty.
 */
const char *
RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    const Locale *result;
    switch(type) {
    case ULOC_ACTUAL_LOCALE:
        result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
        break;
    case ULOC_VALID_LOCALE:
        result = &validLocale;
        break;
    case ULOC_REQUESTED_LOCALE:
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    if(result->isBogus()) { return nullptr; }
    const char *id = result->getName();
    return id[0] == 0 ? "root" : id;
}

/** Returns the tailoring's rule string. */
const UnicodeString&
RuleBasedCollator::getRules() const {
    return tailoring->rules;
}

/**
 * Writes the rules into buffer: only the tailoring rules for UCOL_TAILORING_ONLY,
 * otherwise the root rules followed by the tailoring rules.
 */
void
RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
    if(delta == UCOL_TAILORING_ONLY) {
        buffer = tailoring->rules;
        return;
    }
    // UCOL_FULL_RULES
    buffer.remove();
    CollationLoader::appendRootRules(buffer);
    buffer.append(tailoring->rules).getTerminatedBuffer();
}

/**
 * Copies the tailoring's version into version
 * and adds a value derived from UCOL_RUNTIME_VERSION to the first byte.
 */
void
RuleBasedCollator::getVersion(UVersionInfo version) const {
    uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
    version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
}

/**
 * Returns a newly allocated set of what this collator tailors;
 * the set is empty for the root collator (data->base == nullptr).
 * Returns nullptr and sets errorCode on failure.
 * Callers in this file take ownership of the result via LocalPointer.
 */
UnicodeSet *
RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return nullptr; }
    UnicodeSet *tailored = new UnicodeSet();
    if(tailored == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if(data->base != nullptr) {
        TailoredSet(tailored).forData(data, errorCode);
        if(U_FAILURE(errorCode)) {
            delete tailored;
            return nullptr;
        }
    }
    return tailored;
}

/**
 * Clears the non-null output sets and then fills them
 * via ContractionsAndExpansions::forData() for this collator's data.
 * Either set pointer may be nullptr.
 */
void
RuleBasedCollator::internalGetContractionsAndExpansions(
        UnicodeSet *contractions, UnicodeSet *expansions,
        UBool addPrefixes, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }
    if(contractions != nullptr) {
        contractions->clear();
    }
    if(expansions != nullptr) {
        expansions->clear();
    }
    ContractionsAndExpansions(contractions, expansions, nullptr, addPrefixes).forData(data, errorCode);
}

/**
 * Collects contractions for the single code point c into set
 * via ContractionsAndExpansions::forCodePoint(). The set is not cleared first.
 */
void
RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }
    ContractionsAndExpansions(&set, nullptr, nullptr, false).forCodePoint(data, c, errorCode);
}

/** Returns the settings stored in the tailoring, which serve as this collator's defaults. */
const CollationSettings &
RuleBasedCollator::getDefaultSettings() const {
    return *tailoring->settings;
}

/**
 * Returns the current value of an attribute.
 * Attributes that are single option flags share the flag test at the end;
 * the others return directly from the switch.
 * An unknown attribute sets U_ILLEGAL_ARGUMENT_ERROR and returns UCOL_DEFAULT.
 */
UColAttributeValue
RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    int32_t option;
    switch(attr) {
    case UCOL_FRENCH_COLLATION:
        option = CollationSettings::BACKWARD_SECONDARY;
        break;
    case UCOL_ALTERNATE_HANDLING:
        return settings->getAlternateHandling();
    case UCOL_CASE_FIRST:
        return settings->getCaseFirst();
    case UCOL_CASE_LEVEL:
        option = CollationSettings::CASE_LEVEL;
        break;
    case UCOL_NORMALIZATION_MODE:
        option = CollationSettings::CHECK_FCD;
        break;
    case UCOL_STRENGTH:
        return static_cast<UColAttributeValue>(settings->getStrength());
    case UCOL_HIRAGANA_QUATERNARY_MODE:
        // Deprecated attribute, unsettable.
        return UCOL_OFF;
    case UCOL_NUMERIC_COLLATION:
        option = CollationSettings::NUMERIC;
        break;
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_DEFAULT;
    }
    return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
}

/**
 * Sets an attribute.
 * The shared settings are only cloned (SharedObject::copyOnWrite())
 * when the value actually changes.
 * Afterwards the attribute is marked as default or as explicitly set,
 * depending on whether value is UCOL_DEFAULT.
 */
void
RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
                                UErrorCode &errorCode) {
    // getAttribute() also validates attr and checks for an incoming failure.
    UColAttributeValue oldValue = getAttribute(attr, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    if(value == oldValue) {
        // No change in value; only note that the caller set it.
        setAttributeExplicitly(attr);
        return;
    }
    const CollationSettings &defaultSettings = getDefaultSettings();
    if(settings == &defaultSettings) {
        if(value == UCOL_DEFAULT) {
            // Still using the default settings object: nothing to reset.
            setAttributeDefault(attr);
            return;
        }
    }
    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
    if(ownedSettings == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    switch(attr) {
    case UCOL_FRENCH_COLLATION:
        ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
                               defaultSettings.options, errorCode);
        break;
    case UCOL_ALTERNATE_HANDLING:
        ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
        break;
    case UCOL_CASE_FIRST:
        ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
        break;
    case UCOL_CASE_LEVEL:
        ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
                               defaultSettings.options, errorCode);
        break;
    case UCOL_NORMALIZATION_MODE:
        ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
                               defaultSettings.options, errorCode);
        break;
    case UCOL_STRENGTH:
        ownedSettings->setStrength(value, defaultSettings.options, errorCode);
        break;
    case UCOL_HIRAGANA_QUATERNARY_MODE:
        // Deprecated attribute. Check for valid values but do not change anything.
        if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_NUMERIC_COLLATION:
        ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
        break;
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
    if(U_FAILURE(errorCode)) { return; }
    // The fast Latin options are derived from the settings and need to be recomputed.
    setFastLatinOptions(*ownedSettings);
    if(value == UCOL_DEFAULT) {
        setAttributeDefault(attr);
    } else {
        setAttributeExplicitly(attr);
    }
}

/**
 * Sets the maximum reordering group whose characters are affected by alternate handling.
 * @param group UCOL_REORDER_CODE_DEFAULT, or a reorder code from
 *        UCOL_REORDER_CODE_FIRST through UCOL_REORDER_CODE_CURRENCY;
 *        anything else sets U_ILLEGAL_ARGUMENT_ERROR
 * @return *this
 */
Collator &
RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return *this; }
    // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
    int32_t value;
    if(group == UCOL_REORDER_CODE_DEFAULT) {
        value = UCOL_DEFAULT;
    } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
        value = group - UCOL_REORDER_CODE_FIRST;
    } else {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
    if(value == oldValue) {
        setAttributeExplicitly(ATTR_VARIABLE_TOP);
        return *this;
    }
    const CollationSettings &defaultSettings = getDefaultSettings();
    if(settings == &defaultSettings) {
        if(value == UCOL_DEFAULT) {
            setAttributeDefault(ATTR_VARIABLE_TOP);
            return *this;
        }
    }
    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
    if(ownedSettings == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }

    // Resolve "default" to the default settings' group
    // so that the matching variable top can be looked up.
    if(group == UCOL_REORDER_CODE_DEFAULT) {
        group = static_cast<UColReorderCode>(
            UCOL_REORDER_CODE_FIRST + int32_t{defaultSettings.getMaxVariable()});
    }
    uint32_t varTop = data->getLastPrimaryForGroup(group);
    U_ASSERT(varTop != 0);
    ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
    if(U_FAILURE(errorCode)) { return *this; }
    ownedSettings->variableTop = varTop;
    setFastLatinOptions(*ownedSettings);
    if(value == UCOL_DEFAULT) {
        setAttributeDefault(ATTR_VARIABLE_TOP);
    } else {
        setAttributeExplicitly(ATTR_VARIABLE_TOP);
    }
    return *this;
}

/** Returns the current max-variable setting converted back to a reorder code. */
UColReorderCode
RuleBasedCollator::getMaxVariable() const {
    return static_cast<UColReorderCode>(UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
}

/** Returns the current variable top primary weight. The error code is neither read nor set. */
uint32_t
RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
    return settings->variableTop;
}

/**
 * Sets the variable top from a string which must map to exactly one collation element.
 * @param varTop the string; must not be nullptr unless len is 0
 * @param len the string length, or negative if varTop is NUL-terminated;
 *        an empty string sets U_ILLEGAL_ARGUMENT_ERROR
 * @param errorCode set to U_CE_NOT_FOUND_ERROR if the string yields no CE or more than one
 * @return the resulting variable top, or 0 on error
 */
uint32_t
RuleBasedCollator::setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(varTop == nullptr && len !=0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(len < 0) { len = u_strlen(varTop); }
    if(len == 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UBool numeric = settings->isNumeric();
    // Fetch the first two CEs; the second one must be NO_CE.
    int64_t ce1, ce2;
    if(settings->dontCheckFCD()) {
        UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
        ce1 = ci.nextCE(errorCode);
        ce2 = ci.nextCE(errorCode);
    } else {
        FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
        ce1 = ci.nextCE(errorCode);
        ce2 = ci.nextCE(errorCode);
    }
    if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
        errorCode = U_CE_NOT_FOUND_ERROR;
        return 0;
    }
    // The upper 32 bits of the CE are passed on as the primary weight.
    setVariableTop(static_cast<uint32_t>(ce1 >> 32), errorCode);
    return settings->variableTop;
}

/** UnicodeString convenience overload of setVariableTop(const char16_t *, int32_t, UErrorCode &). */
uint32_t
RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
    return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
}

/**
 * Sets the variable top to the last primary of the reordering group which contains varTop.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if varTop is not in one of the groups
 * UCOL_REORDER_CODE_FIRST..UCOL_REORDER_CODE_CURRENCY.
 */
void
RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(varTop != settings->variableTop) {
        // Pin the variable top to the end of the reordering group which contains it.
        // Only a few special groups are supported.
        int32_t group = data->getGroupForPrimary(varTop);
        if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        uint32_t v = data->getLastPrimaryForGroup(group);
        U_ASSERT(v != 0 && v >= varTop);
        varTop = v;
        // Check again: The pinned value may be the one that is already set.
        if(varTop != settings->variableTop) {
            CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
            if(ownedSettings == nullptr) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
                                          getDefaultSettings().options, errorCode);
            if(U_FAILURE(errorCode)) { return; }
            ownedSettings->variableTop = varTop;
            setFastLatinOptions(*ownedSettings);
        }
    }
    if(varTop == getDefaultSettings().variableTop) {
        setAttributeDefault(ATTR_VARIABLE_TOP);
    } else {
        setAttributeExplicitly(ATTR_VARIABLE_TOP);
    }
}

/**
 * Copies the current reorder codes into dest.
 * @param dest output array; may be nullptr only if capacity is 0
 * @param capacity number of int32_t elements available in dest; must not be negative
 * @return the number of reorder codes; if that exceeds capacity,
 *         then U_BUFFER_OVERFLOW_ERROR is set and nothing is copied
 */
int32_t
RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
                                   UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    if(capacity < 0 || (dest == nullptr && capacity > 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    int32_t length = settings->reorderCodesLength;
    if(length == 0) { return 0; }
    if(length > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return length;
    }
    // 4 bytes per int32_t reorder code
    uprv_memcpy(dest, settings->reorderCodes, length * 4);
    return length;
}

/**
 * Sets the reorder codes.
 * A single UCOL_REORDER_CODE_NONE is treated like an empty list,
 * and a single UCOL_REORDER_CODE_DEFAULT restores the reordering of the default settings.
 * @param reorderCodes input array; may be nullptr only if length is 0
 * @param length number of codes; must not be negative
 */
void
RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
                                   UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(length < 0 || (reorderCodes == nullptr && length > 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
        length = 0;
    }
    // Nothing to do if the new codes are the same as the current ones.
    if(length == settings->reorderCodesLength &&
            uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
        return;
    }
    const CollationSettings &defaultSettings = getDefaultSettings();
    if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
        if(settings != &defaultSettings) {
            CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
            if(ownedSettings == nullptr) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
            setFastLatinOptions(*ownedSettings);
        }
        return;
    }
    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
    if(ownedSettings == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
    setFastLatinOptions(*ownedSettings);
}

/**
 * Recomputes ownedSettings.fastLatinOptions (and fills ownedSettings.fastLatinPrimaries)
 * via CollationFastLatin::getOptions() for this collator's data.
 */
void
RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
            data, ownedSettings,
            ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
}

/** Compares two UnicodeStrings; returns UCOL_EQUAL if errorCode is already a failure. */
UCollationResult
RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    return doCompare(left.getBuffer(), left.length(),
                     right.getBuffer(), right.length(), errorCode);
}

/**
 * Compares at most the first length code units of each string.
 * A length of 0 yields UCOL_EQUAL; a negative length sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
                           int32_t length, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
    if(length < 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    int32_t leftLength = left.length();
    int32_t rightLength = right.length();
    if(leftLength > length) { leftLength = length; }
    if(rightLength > length) { rightLength = length; }
    return doCompare(left.getBuffer(), leftLength,
                     right.getBuffer(), rightLength, errorCode);
}

/**
 * Compares two UTF-16 strings.
 * A negative length means that the string is NUL-terminated.
 * A nullptr string with a non-zero length sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::compare(const char16_t *left, int32_t leftLength,
                           const char16_t *right, int32_t rightLength,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    // Make sure both or neither strings have a known length.
    // We do not optimize for mixed length/termination.
    if(leftLength >= 0) {
        if(rightLength < 0) { rightLength = u_strlen(right); }
    } else {
        if(rightLength >= 0) { leftLength = u_strlen(left); }
    }
    return doCompare(left, leftLength, right, rightLength, errorCode);
}

/**
 * Compares two UTF-8 strings given as StringPieces.
 * A nullptr data pointer of a non-empty piece sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
                               UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
    const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
    if((leftBytes == nullptr && !left.empty()) || (rightBytes == nullptr && !right.empty())) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
}

/**
 * Compares two UTF-8 strings given as char pointers.
 * A negative length means that the string is NUL-terminated.
 * A nullptr string with a non-zero length sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
                                       const char *right, int32_t rightLength,
                                       UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    // Make sure both or neither strings have a known length.
    // We do not optimize for mixed length/termination.
    if(leftLength >= 0) {
        if(rightLength < 0) { rightLength = static_cast<int32_t>(uprv_strlen(right)); }
    } else {
        if(rightLength >= 0) { leftLength = static_cast<int32_t>(uprv_strlen(left)); }
    }
    return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
                     reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
}

namespace {

/**
 * Abstract iterator for identical-level string comparisons.
 * Returns FCD code points and handles temporary switching to NFD.
 */
class NFDIterator : public UObject {
public:
    // index < 0 means that no decomposition is currently being delivered.
    NFDIterator() : index(-1), length(0) {}
    virtual ~NFDIterator() {}
    /**
     * Returns the next code point from the internal normalization buffer,
     * or else the next text code point.
     * Returns -1 at the end of the text.
     */
    UChar32 nextCodePoint() {
        if(index >= 0) {
            if(index == length) {
                // The decomposition has been used up; go back to the text.
                index = -1;
            } else {
                UChar32 c;
                U16_NEXT_UNSAFE(decomp, index, c);
                return c;
            }
        }
        return nextRawCodePoint();
    }
    /**
     * @param nfcImpl
     * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
     * @return the first code point in c's decomposition,
     *         or c itself if it was decomposed already or if it does not decompose
     */
    UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
        if(index >= 0) { return c; }
        decomp = nfcImpl.getDecomposition(c, buffer, length);
        if(decomp == nullptr) { return c; }
        index = 0;
        U16_NEXT_UNSAFE(decomp, index, c);
        return c;
    }
protected:
    /**
     * Returns the next text code point in FCD order.
     * Returns -1 at the end of the text.
     */
    virtual UChar32 nextRawCodePoint() = 0;
private:
    // Current decomposition; only read while index >= 0.
    const char16_t *decomp;
    // Scratch space passed to getDecomposition().
    char16_t buffer[4];
    // Read position in decomp, or -1.
    int32_t index;
    // Length of decomp.
    int32_t length;
};

/**
 * NFDIterator over UTF-16 text, either with a limit pointer
 * or NUL-terminated (limit == nullptr).
 */
class UTF16NFDIterator : public NFDIterator {
public:
    UTF16NFDIterator(const char16_t *text, const char16_t *textLimit) : s(text), limit(textLimit) {}
protected:
    virtual UChar32 nextRawCodePoint() override {
        if(s == limit) { return U_SENTINEL; }
        UChar32 c = *s++;
        if(limit == nullptr && c == 0) {
            // Reached the terminating NUL: make s == limit so that further calls return U_SENTINEL.
            s = nullptr;
            return U_SENTINEL;
        }
        // Combine a surrogate pair into a supplementary code point.
        char16_t trail;
        if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
            ++s;
            c = U16_GET_SUPPLEMENTARY(c, trail);
        }
        return c;
    }

    const char16_t *s;
    const char16_t *limit;
};

/**
 * UTF-16 NFDIterator for text that is not known to be in FCD form.
 * The constructor runs nfcImpl.makeFCD() over the text.
 * If all of the text passes, then the original text is iterated over directly;
 * otherwise the text is copied into an internal string with the remainder run through makeFCD().
 * If an error occurs, then s and limit remain nullptr and the iterator yields no code points.
 */
class FCDUTF16NFDIterator : public UTF16NFDIterator {
public:
    FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const char16_t *text, const char16_t *textLimit)
            : UTF16NFDIterator(nullptr, nullptr) {
        UErrorCode errorCode = U_ZERO_ERROR;
        const char16_t *spanLimit = nfcImpl.makeFCD(text, textLimit, nullptr, errorCode);
        if(U_FAILURE(errorCode)) { return; }
        if(spanLimit == textLimit || (textLimit == nullptr && *spanLimit == 0)) {
            s = text;
            limit = spanLimit;
        } else {
            str.setTo(text, static_cast<int32_t>(spanLimit - text));
            {
                // The ReorderingBuffer goes out of scope before str is read below.
                ReorderingBuffer r_buffer(nfcImpl, str);
                if(r_buffer.init(str.length(), errorCode)) {
                    nfcImpl.makeFCD(spanLimit, textLimit, &r_buffer, errorCode);
                }
            }
            if(U_SUCCESS(errorCode)) {
                s = str.getBuffer();
                limit = s + str.length();
            }
        }
    }
private:
    UnicodeString str;
};

/**
 * NFDIterator over UTF-8 text.
 * A negative length means that the text is NUL-terminated.
 */
class UTF8NFDIterator : public NFDIterator {
public:
    UTF8NFDIterator(const uint8_t *text, int32_t textLength)
        : s(text), pos(0), length(textLength) {}
protected:
    virtual UChar32 nextRawCodePoint() override {
        if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
        UChar32 c;
        U8_NEXT_OR_FFFD(s, pos, length, c);
        return c;
    }

    const uint8_t *s;
    int32_t pos;
    int32_t length;
};

/**
 * UTF-8 NFDIterator which gets its code points from an FCDUTF8CollationIterator.
 * Errors from the collation iterator are not reported.
 */
class FCDUTF8NFDIterator : public NFDIterator {
public:
    FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
            : u8ci(data, false, text, 0, textLength) {}
protected:
    virtual UChar32 nextRawCodePoint() override {
        UErrorCode errorCode = U_ZERO_ERROR;
        return u8ci.nextCodePoint(errorCode);
    }
private:
    FCDUTF8CollationIterator u8ci;
};

/** NFDIterator which reads code points from a UCharIterator via uiter_next32(). */
class UIterNFDIterator : public NFDIterator {
public:
    UIterNFDIterator(UCharIterator &it) : iter(it) {}
protected:
    virtual UChar32 nextRawCodePoint() override {
        return uiter_next32(&iter);
    }
private:
    UCharIterator &iter;
};

/**
 * UCharIterator-based NFDIterator which gets its code points from an FCDUIterCollationIterator.
 * Errors from the collation iterator are not reported.
 */
class FCDUIterNFDIterator : public NFDIterator {
public:
    FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
            : uici(data, false, it, startIndex) {}
protected:
    virtual UChar32 nextRawCodePoint() override {
        UErrorCode errorCode = U_ZERO_ERROR;
        return uici.nextCodePoint(errorCode);
    }
private:
    FCDUIterCollationIterator uici;
};

/**
 * Identical-level comparison of two texts, code point by code point.
 * Code points that differ are decomposed (see NFDIterator::nextDecomposedCodePoint())
 * before their values are compared.
 * The end of a string sorts before U+FFFE, which sorts before every other code point.
 */
UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
                                NFDIterator &left, NFDIterator &right) {
    for(;;) {
        // Fetch the next FCD code point from each string.
        UChar32 leftCp = left.nextCodePoint();
        UChar32 rightCp = right.nextCodePoint();
        if(leftCp == rightCp) {
            if(leftCp < 0) { break; }
            continue;
        }
        // If they are different, then decompose each and compare again.
        if(leftCp < 0) {
            leftCp = -2;  // end of string
        } else if(leftCp == 0xfffe) {
            leftCp = -1;  // U+FFFE: merge separator
        } else {
            leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
        }
        if(rightCp < 0) {
            rightCp = -2;  // end of string
        } else if(rightCp == 0xfffe) {
            rightCp = -1;  // U+FFFE: merge separator
        } else {
            rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
        }
        if(leftCp < rightCp) { return UCOL_LESS; }
        if(leftCp > rightCp) { return UCOL_GREATER; }
    }
    return UCOL_EQUAL;
}

}  // namespace

/**
 * UTF-16 comparison workhorse.
 * Steps: skip the identical prefix, try the Latin fast path,
 * fall back to the full comparison with collation iterators,
 * and finally, for identical strength, compare the remaining text via compareNFDIter().
 * The lengths are either both negative (NUL-terminated strings) or both non-negative;
 * the public compare() functions in this file ensure that.
 */
UCollationResult
RuleBasedCollator::doCompare(const char16_t *left, int32_t leftLength,
                             const char16_t *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    const char16_t *leftLimit;
    const char16_t *rightLimit;
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        leftLimit = nullptr;
        rightLimit = nullptr;
        char16_t c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        leftLimit = left + leftLength;
        rightLimit = right + rightLength;
        for(;;) {
            if(equalPrefixLength == leftLength) {
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        if((equalPrefixLength != leftLength &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
                (equalPrefixLength != rightLength &&
                    data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            while(--equalPrefixLength > 0 &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
        }
        // Notes:
        // - A longer string can compare equal to a prefix of it if only ignorables follow.
        // - With a backward level, a longer string can compare less-than a prefix of it.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    // Try the fast path if it is enabled (options >= 0) and
    // the first differing code unit of each string, if any, is at most LATIN_MAX.
    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
        if(leftLength >= 0) {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength,
                                                      leftLength - equalPrefixLength,
                                                      right + equalPrefixLength,
                                                      rightLength - equalPrefixLength);
        } else {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength, -1,
                                                      right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    // Full comparison if the fast path was not taken or gave up.
    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        if(settings->dontCheckFCD()) {
            UTF16CollationIterator leftIter(data, numeric,
                                            left, left + equalPrefixLength, leftLimit);
            UTF16CollationIterator rightIter(data, numeric,
                                             right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF16CollationIterator leftIter(data, numeric,
                                               left, left + equalPrefixLength, leftLimit);
            FCDUTF16CollationIterator rightIter(data, numeric,
                                                right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return static_cast<UCollationResult>(result);
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    if(settings->dontCheckFCD()) {
        UTF16NFDIterator leftIter(left, leftLimit);
        UTF16NFDIterator rightIter(right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
        FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}

UCollationResult
RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
                             const uint8_t *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
1098 int32_t equalPrefixLength = 0; 1099 if(leftLength < 0) { 1100 uint8_t c; 1101 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) { 1102 if(c == 0) { return UCOL_EQUAL; } 1103 ++equalPrefixLength; 1104 } 1105 } else { 1106 for(;;) { 1107 if(equalPrefixLength == leftLength) { 1108 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; } 1109 break; 1110 } else if(equalPrefixLength == rightLength || 1111 left[equalPrefixLength] != right[equalPrefixLength]) { 1112 break; 1113 } 1114 ++equalPrefixLength; 1115 } 1116 } 1117 // Back up to the start of a partially-equal code point. 1118 if(equalPrefixLength > 0 && 1119 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) || 1120 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) { 1121 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {} 1122 } 1123 1124 UBool numeric = settings->isNumeric(); 1125 if(equalPrefixLength > 0) { 1126 UBool unsafe = false; 1127 if(equalPrefixLength != leftLength) { 1128 int32_t i = equalPrefixLength; 1129 UChar32 c; 1130 U8_NEXT_OR_FFFD(left, i, leftLength, c); 1131 unsafe = data->isUnsafeBackward(c, numeric); 1132 } 1133 if(!unsafe && equalPrefixLength != rightLength) { 1134 int32_t i = equalPrefixLength; 1135 UChar32 c; 1136 U8_NEXT_OR_FFFD(right, i, rightLength, c); 1137 unsafe = data->isUnsafeBackward(c, numeric); 1138 } 1139 if(unsafe) { 1140 // Identical prefix: Back up to the start of a contraction or reordering sequence. 1141 UChar32 c; 1142 do { 1143 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c); 1144 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric)); 1145 } 1146 // See the notes in the UTF-16 version. 1147 1148 // Pass the actual start of each string into the CollationIterators, 1149 // plus the equalPrefixLength position, 1150 // so that prefix matches back into the equal prefix work. 
1151 } 1152 1153 int32_t result; 1154 int32_t fastLatinOptions = settings->fastLatinOptions; 1155 if(fastLatinOptions >= 0 && 1156 (equalPrefixLength == leftLength || 1157 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) && 1158 (equalPrefixLength == rightLength || 1159 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) { 1160 if(leftLength >= 0) { 1161 result = CollationFastLatin::compareUTF8(data->fastLatinTable, 1162 settings->fastLatinPrimaries, 1163 fastLatinOptions, 1164 left + equalPrefixLength, 1165 leftLength - equalPrefixLength, 1166 right + equalPrefixLength, 1167 rightLength - equalPrefixLength); 1168 } else { 1169 result = CollationFastLatin::compareUTF8(data->fastLatinTable, 1170 settings->fastLatinPrimaries, 1171 fastLatinOptions, 1172 left + equalPrefixLength, -1, 1173 right + equalPrefixLength, -1); 1174 } 1175 } else { 1176 result = CollationFastLatin::BAIL_OUT_RESULT; 1177 } 1178 1179 if(result == CollationFastLatin::BAIL_OUT_RESULT) { 1180 if(settings->dontCheckFCD()) { 1181 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength); 1182 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength); 1183 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1184 } else { 1185 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength); 1186 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength); 1187 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1188 } 1189 } 1190 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1191 return static_cast<UCollationResult>(result); 1192 } 1193 1194 // Note: If NUL-terminated, we could get the actual limits from the iterators now. 
1195 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience, 1196 // and the benefit seems unlikely to be measurable. 1197 1198 // Compare identical level. 1199 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1200 left += equalPrefixLength; 1201 right += equalPrefixLength; 1202 if(leftLength > 0) { 1203 leftLength -= equalPrefixLength; 1204 rightLength -= equalPrefixLength; 1205 } 1206 if(settings->dontCheckFCD()) { 1207 UTF8NFDIterator leftIter(left, leftLength); 1208 UTF8NFDIterator rightIter(right, rightLength); 1209 return compareNFDIter(nfcImpl, leftIter, rightIter); 1210 } else { 1211 FCDUTF8NFDIterator leftIter(data, left, leftLength); 1212 FCDUTF8NFDIterator rightIter(data, right, rightLength); 1213 return compareNFDIter(nfcImpl, leftIter, rightIter); 1214 } 1215 } 1216 1217 UCollationResult 1218 RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right, 1219 UErrorCode &errorCode) const { 1220 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; } 1221 UBool numeric = settings->isNumeric(); 1222 1223 // Identical-prefix test. 1224 int32_t equalPrefixLength = 0; 1225 { 1226 UChar32 leftUnit; 1227 UChar32 rightUnit; 1228 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) { 1229 if(leftUnit < 0) { return UCOL_EQUAL; } 1230 ++equalPrefixLength; 1231 } 1232 1233 // Back out the code units that differed, for the real collation comparison. 1234 if(leftUnit >= 0) { left.previous(&left); } 1235 if(rightUnit >= 0) { right.previous(&right); } 1236 1237 if(equalPrefixLength > 0) { 1238 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) || 1239 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) { 1240 // Identical prefix: Back up to the start of a contraction or reordering sequence. 
1241 do { 1242 --equalPrefixLength; 1243 leftUnit = left.previous(&left); 1244 right.previous(&right); 1245 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric)); 1246 } 1247 // See the notes in the UTF-16 version. 1248 } 1249 } 1250 1251 UCollationResult result; 1252 if(settings->dontCheckFCD()) { 1253 UIterCollationIterator leftIter(data, numeric, left); 1254 UIterCollationIterator rightIter(data, numeric, right); 1255 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1256 } else { 1257 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength); 1258 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength); 1259 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1260 } 1261 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1262 return result; 1263 } 1264 1265 // Compare identical level. 1266 left.move(&left, equalPrefixLength, UITER_ZERO); 1267 right.move(&right, equalPrefixLength, UITER_ZERO); 1268 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1269 if(settings->dontCheckFCD()) { 1270 UIterNFDIterator leftIter(left); 1271 UIterNFDIterator rightIter(right); 1272 return compareNFDIter(nfcImpl, leftIter, rightIter); 1273 } else { 1274 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength); 1275 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength); 1276 return compareNFDIter(nfcImpl, leftIter, rightIter); 1277 } 1278 } 1279 1280 CollationKey & 1281 RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key, 1282 UErrorCode &errorCode) const { 1283 return getCollationKey(s.getBuffer(), s.length(), key, errorCode); 1284 } 1285 1286 CollationKey & 1287 RuleBasedCollator::getCollationKey(const char16_t *s, int32_t length, CollationKey& key, 1288 UErrorCode &errorCode) const { 1289 if(U_FAILURE(errorCode)) { 1290 return key.setToBogus(); 1291 } 1292 if(s 
== nullptr && length != 0) { 1293 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1294 return key.setToBogus(); 1295 } 1296 key.reset(); // resets the "bogus" state 1297 CollationKeyByteSink sink(key); 1298 writeSortKey(s, length, sink, errorCode); 1299 if(U_FAILURE(errorCode)) { 1300 key.setToBogus(); 1301 } else if(key.isBogus()) { 1302 errorCode = U_MEMORY_ALLOCATION_ERROR; 1303 } else { 1304 key.setLength(sink.NumberOfBytesAppended()); 1305 } 1306 return key; 1307 } 1308 1309 int32_t 1310 RuleBasedCollator::getSortKey(const UnicodeString &s, 1311 uint8_t *dest, int32_t capacity) const { 1312 return getSortKey(s.getBuffer(), s.length(), dest, capacity); 1313 } 1314 1315 int32_t 1316 RuleBasedCollator::getSortKey(const char16_t *s, int32_t length, 1317 uint8_t *dest, int32_t capacity) const { 1318 if((s == nullptr && length != 0) || capacity < 0 || (dest == nullptr && capacity > 0)) { 1319 return 0; 1320 } 1321 uint8_t noDest[1] = { 0 }; 1322 if(dest == nullptr) { 1323 // Distinguish pure preflighting from an allocation error. 1324 dest = noDest; 1325 capacity = 0; 1326 } 1327 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity); 1328 UErrorCode errorCode = U_ZERO_ERROR; 1329 writeSortKey(s, length, sink, errorCode); 1330 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0; 1331 } 1332 1333 void 1334 RuleBasedCollator::writeSortKey(const char16_t *s, int32_t length, 1335 SortKeyByteSink &sink, UErrorCode &errorCode) const { 1336 if(U_FAILURE(errorCode)) { return; } 1337 const char16_t *limit = (length >= 0) ? 
s + length : nullptr; 1338 UBool numeric = settings->isNumeric(); 1339 CollationKeys::LevelCallback callback; 1340 if(settings->dontCheckFCD()) { 1341 UTF16CollationIterator iter(data, numeric, s, s, limit); 1342 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings, 1343 sink, Collation::PRIMARY_LEVEL, 1344 callback, true, errorCode); 1345 } else { 1346 FCDUTF16CollationIterator iter(data, numeric, s, s, limit); 1347 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings, 1348 sink, Collation::PRIMARY_LEVEL, 1349 callback, true, errorCode); 1350 } 1351 if(settings->getStrength() == UCOL_IDENTICAL) { 1352 writeIdenticalLevel(s, limit, sink, errorCode); 1353 } 1354 static const char terminator = 0; // TERMINATOR_BYTE 1355 sink.Append(&terminator, 1); 1356 } 1357 1358 void 1359 RuleBasedCollator::writeIdenticalLevel(const char16_t *s, const char16_t *limit, 1360 SortKeyByteSink &sink, UErrorCode &errorCode) const { 1361 // NFD quick check 1362 const char16_t *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, nullptr, errorCode); 1363 if(U_FAILURE(errorCode)) { return; } 1364 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); 1365 UChar32 prev = 0; 1366 if(nfdQCYesLimit != s) { 1367 prev = u_writeIdenticalLevelRun(prev, s, static_cast<int32_t>(nfdQCYesLimit - s), sink); 1368 } 1369 // Is there non-NFD text? 
1370 int32_t destLengthEstimate; 1371 if(limit != nullptr) { 1372 if(nfdQCYesLimit == limit) { return; } 1373 destLengthEstimate = static_cast<int32_t>(limit - nfdQCYesLimit); 1374 } else { 1375 // s is NUL-terminated 1376 if(*nfdQCYesLimit == 0) { return; } 1377 destLengthEstimate = -1; 1378 } 1379 UnicodeString nfd; 1380 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode); 1381 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink); 1382 } 1383 1384 namespace { 1385 1386 /** 1387 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary() 1388 * with an instance of this callback class. 1389 * When another level is about to be written, the callback 1390 * records the level and the number of bytes that will be written until 1391 * the sink (which is actually a FixedSortKeyByteSink) fills up. 1392 * 1393 * When internalNextSortKeyPart() is called again, it restarts with the last level 1394 * and ignores as many bytes as were written previously for that level. 1395 */ 1396 class PartLevelCallback : public CollationKeys::LevelCallback { 1397 public: 1398 PartLevelCallback(const SortKeyByteSink &s) 1399 : sink(s), level(Collation::PRIMARY_LEVEL) { 1400 levelCapacity = sink.GetRemainingCapacity(); 1401 } 1402 virtual ~PartLevelCallback() {} 1403 virtual UBool needToWrite(Collation::Level l) override { 1404 if(!sink.Overflowed()) { 1405 // Remember a level that will be at least partially written. 
1406 level = l; 1407 levelCapacity = sink.GetRemainingCapacity(); 1408 return true; 1409 } else { 1410 return false; 1411 } 1412 } 1413 Collation::Level getLevel() const { return level; } 1414 int32_t getLevelCapacity() const { return levelCapacity; } 1415 1416 private: 1417 const SortKeyByteSink &sink; 1418 Collation::Level level; 1419 int32_t levelCapacity; 1420 }; 1421 1422 } // namespace 1423 1424 int32_t 1425 RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2], 1426 uint8_t *dest, int32_t count, UErrorCode &errorCode) const { 1427 if(U_FAILURE(errorCode)) { return 0; } 1428 if(iter == nullptr || state == nullptr || count < 0 || (count > 0 && dest == nullptr)) { 1429 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1430 return 0; 1431 } 1432 if(count == 0) { return 0; } 1433 1434 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count); 1435 sink.IgnoreBytes(static_cast<int32_t>(state[1])); 1436 iter->move(iter, 0, UITER_START); 1437 1438 Collation::Level level = static_cast<Collation::Level>(state[0]); 1439 if(level <= Collation::QUATERNARY_LEVEL) { 1440 UBool numeric = settings->isNumeric(); 1441 PartLevelCallback callback(sink); 1442 if(settings->dontCheckFCD()) { 1443 UIterCollationIterator ci(data, numeric, *iter); 1444 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings, 1445 sink, level, callback, false, errorCode); 1446 } else { 1447 FCDUIterCollationIterator ci(data, numeric, *iter, 0); 1448 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings, 1449 sink, level, callback, false, errorCode); 1450 } 1451 if(U_FAILURE(errorCode)) { return 0; } 1452 if(sink.NumberOfBytesAppended() > count) { 1453 state[0] = static_cast<uint32_t>(callback.getLevel()); 1454 state[1] = static_cast<uint32_t>(callback.getLevelCapacity()); 1455 return count; 1456 } 1457 // All of the normal levels are done. 
1458 if(settings->getStrength() == UCOL_IDENTICAL) { 1459 level = Collation::IDENTICAL_LEVEL; 1460 iter->move(iter, 0, UITER_START); 1461 } 1462 // else fall through to setting ZERO_LEVEL 1463 } 1464 1465 if(level == Collation::IDENTICAL_LEVEL) { 1466 int32_t levelCapacity = sink.GetRemainingCapacity(); 1467 UnicodeString s; 1468 for(;;) { 1469 UChar32 c = iter->next(iter); 1470 if(c < 0) { break; } 1471 s.append(static_cast<char16_t>(c)); 1472 } 1473 const char16_t *sArray = s.getBuffer(); 1474 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode); 1475 if(U_FAILURE(errorCode)) { return 0; } 1476 if(sink.NumberOfBytesAppended() > count) { 1477 state[0] = static_cast<uint32_t>(level); 1478 state[1] = static_cast<uint32_t>(levelCapacity); 1479 return count; 1480 } 1481 } 1482 1483 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes. 1484 state[0] = static_cast<uint32_t>(Collation::ZERO_LEVEL); 1485 state[1] = 0; 1486 int32_t length = sink.NumberOfBytesAppended(); 1487 int32_t i = length; 1488 while(i < count) { dest[i++] = 0; } 1489 return length; 1490 } 1491 1492 void 1493 RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces, 1494 UErrorCode &errorCode) const { 1495 if(U_FAILURE(errorCode)) { return; } 1496 const char16_t *s = str.getBuffer(); 1497 const char16_t *limit = s + str.length(); 1498 UBool numeric = settings->isNumeric(); 1499 if(settings->dontCheckFCD()) { 1500 UTF16CollationIterator iter(data, numeric, s, s, limit); 1501 int64_t ce; 1502 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) { 1503 ces.addElement(ce, errorCode); 1504 } 1505 } else { 1506 FCDUTF16CollationIterator iter(data, numeric, s, s, limit); 1507 int64_t ce; 1508 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) { 1509 ces.addElement(ce, errorCode); 1510 } 1511 } 1512 } 1513 1514 namespace { 1515 1516 void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length, 1517 UErrorCode &errorCode) { 1518 
if(U_FAILURE(errorCode) || length == 0) { return; } 1519 if(!s.isEmpty()) { 1520 s.append('_', errorCode); 1521 } 1522 s.append(letter, errorCode); 1523 for(int32_t i = 0; i < length; ++i) { 1524 s.append(uprv_toupper(subtag[i]), errorCode); 1525 } 1526 } 1527 1528 void appendAttribute(CharString &s, char letter, UColAttributeValue value, 1529 UErrorCode &errorCode) { 1530 if(U_FAILURE(errorCode)) { return; } 1531 if(!s.isEmpty()) { 1532 s.append('_', errorCode); 1533 } 1534 static const char *valueChars = "1234...........IXO..SN..LU......"; 1535 s.append(letter, errorCode); 1536 s.append(valueChars[value], errorCode); 1537 } 1538 1539 } // namespace 1540 1541 int32_t 1542 RuleBasedCollator::internalGetShortDefinitionString(const char *locale, 1543 char *buffer, int32_t capacity, 1544 UErrorCode &errorCode) const { 1545 if(U_FAILURE(errorCode)) { return 0; } 1546 if(buffer == nullptr ? capacity != 0 : capacity < 0) { 1547 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1548 return 0; 1549 } 1550 if(locale == nullptr) { 1551 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode); 1552 } 1553 1554 char resultLocale[ULOC_FULLNAME_CAPACITY + 1]; 1555 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY, 1556 "collation", locale, 1557 nullptr, &errorCode); 1558 if(U_FAILURE(errorCode)) { return 0; } 1559 resultLocale[length] = 0; 1560 1561 // Append items in alphabetic order of their short definition letters. 1562 CharString result; 1563 1564 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) { 1565 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode); 1566 } 1567 // ATTR_VARIABLE_TOP not supported because 'B' was broken. 1568 // See ICU tickets #10372 and #10386. 
1569 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) { 1570 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode); 1571 } 1572 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) { 1573 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode); 1574 } 1575 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) { 1576 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode); 1577 } 1578 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) { 1579 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode); 1580 } 1581 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default. 1582 CharString collation = ulocimp_getKeywordValue(resultLocale, "collation", errorCode); 1583 appendSubtag(result, 'K', collation.data(), collation.length(), errorCode); 1584 CharString language; 1585 CharString script; 1586 CharString region; 1587 CharString variant; 1588 ulocimp_getSubtags(resultLocale, &language, &script, ®ion, &variant, nullptr, errorCode); 1589 if (language.isEmpty()) { 1590 appendSubtag(result, 'L', "root", 4, errorCode); 1591 } else { 1592 appendSubtag(result, 'L', language.data(), language.length(), errorCode); 1593 } 1594 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) { 1595 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode); 1596 } 1597 appendSubtag(result, 'R', region.data(), region.length(), errorCode); 1598 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) { 1599 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode); 1600 } 1601 appendSubtag(result, 'V', variant.data(), variant.length(), errorCode); 1602 appendSubtag(result, 'Z', script.data(), script.length(), errorCode); 1603 1604 if(U_FAILURE(errorCode)) { return 0; } 1605 return result.extract(buffer, capacity, errorCode); 1606 } 1607 1608 UBool 1609 RuleBasedCollator::isUnsafe(UChar32 
c) const { 1610 return data->isUnsafeBackward(c, settings->isNumeric()); 1611 } 1612 1613 void U_CALLCONV 1614 RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) { 1615 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode); 1616 } 1617 1618 UBool 1619 RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const { 1620 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode); 1621 return U_SUCCESS(errorCode); 1622 } 1623 1624 CollationElementIterator * 1625 RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const { 1626 UErrorCode errorCode = U_ZERO_ERROR; 1627 if(!initMaxExpansions(errorCode)) { return nullptr; } 1628 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode); 1629 if(U_FAILURE(errorCode)) { 1630 delete cei; 1631 return nullptr; 1632 } 1633 return cei; 1634 } 1635 1636 CollationElementIterator * 1637 RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const { 1638 UErrorCode errorCode = U_ZERO_ERROR; 1639 if(!initMaxExpansions(errorCode)) { return nullptr; } 1640 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode); 1641 if(U_FAILURE(errorCode)) { 1642 delete cei; 1643 return nullptr; 1644 } 1645 return cei; 1646 } 1647 1648 int32_t 1649 RuleBasedCollator::getMaxExpansion(int32_t order) const { 1650 UErrorCode errorCode = U_ZERO_ERROR; 1651 (void)initMaxExpansions(errorCode); 1652 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order); 1653 } 1654 1655 U_NAMESPACE_END 1656 1657 #endif // !UCONFIG_NO_COLLATION