alphaindex.cpp (43110B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_COLLATION 13 14 #include "unicode/alphaindex.h" 15 #include "unicode/coll.h" 16 #include "unicode/localpointer.h" 17 #include "unicode/normalizer2.h" 18 #include "unicode/tblcoll.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ulocdata.h" 21 #include "unicode/uniset.h" 22 #include "unicode/uobject.h" 23 #include "unicode/usetiter.h" 24 #include "unicode/utf16.h" 25 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "uassert.h" 29 #include "uvector.h" 30 #include "uvectr64.h" 31 32 //#include <string> 33 //#include <iostream> 34 35 U_NAMESPACE_BEGIN 36 37 namespace { 38 39 /** 40 * Prefix string for Chinese index buckets. 41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 42 */ 43 const char16_t BASE[1] = { 0xFDD0 }; 44 const int32_t BASE_LENGTH = 1; 45 46 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 47 const UnicodeString &one, const UnicodeString &other); 48 49 } // namespace 50 51 static int32_t U_CALLCONV 52 collatorComparator(const void *context, const void *left, const void *right); 53 54 static int32_t U_CALLCONV 55 recordCompareFn(const void *context, const void *left, const void *right); 56 57 // UVector<Record *> support function, delete a Record. 58 static void U_CALLCONV 59 alphaIndex_deleteRecord(void *obj) { 60 delete static_cast<AlphabeticIndex::Record *>(obj); 61 } 62 63 namespace { 64 65 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 66 UErrorCode &errorCode) { 67 if (U_FAILURE(errorCode)) { return nullptr; } 68 if (owned.isValid()) { 69 return owned.orphan(); 70 } 71 UnicodeString *p = new UnicodeString(s); 72 if (p == nullptr) { 73 errorCode = U_MEMORY_ALLOCATION_ERROR; 74 } 75 return p; 76 } 77 78 inline UnicodeString *getString(const UVector &list, int32_t i) { 79 return static_cast<UnicodeString *>(list[i]); 80 } 81 82 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 83 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 84 } 85 86 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 87 return static_cast<AlphabeticIndex::Record *>(list[i]); 88 } 89 90 /** 91 * Like Java Collections.binarySearch(List, String, Comparator). 92 * 93 * @return the index>=0 where the item was found, 94 * or the index<0 for inserting the string at ~index in sorted order 95 */ 96 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 97 if (list.size() == 0) { return ~0; } 98 int32_t start = 0; 99 int32_t limit = list.size(); 100 for (;;) { 101 int32_t i = (start + limit) / 2; 102 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 103 UErrorCode errorCode = U_ZERO_ERROR; 104 UCollationResult cmp = coll.compare(s, *si, errorCode); 105 if (cmp == UCOL_EQUAL) { 106 return i; 107 } else if (cmp < 0) { 108 if (i == start) { 109 return ~start; // insert s before *si 110 } 111 limit = i; 112 } else { 113 if (i == start) { 114 return ~(start + 1); // insert s after *si 115 } 116 start = i; 117 } 118 } 119 } 120 121 } // namespace 122 123 // The BucketList is not in the anonymous namespace because only Clang 124 // seems to support its use in other classes from there. 125 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 126 class BucketList : public UObject { 127 public: 128 BucketList(UVector *bucketList, UVector *publicBucketList) 129 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 130 int32_t displayIndex = 0; 131 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 132 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 133 } 134 } 135 136 // The virtual destructor must not be inline. 137 // See ticket #8454 for details. 138 virtual ~BucketList(); 139 140 int32_t getBucketCount() const { 141 return immutableVisibleList_->size(); 142 } 143 144 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 145 UErrorCode &errorCode) { 146 // binary search 147 int32_t start = 0; 148 int32_t limit = bucketList_->size(); 149 while ((start + 1) < limit) { 150 int32_t i = (start + limit) / 2; 151 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 152 UCollationResult nameVsBucket = 153 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 154 if (nameVsBucket < 0) { 155 limit = i; 156 } else { 157 start = i; 158 } 159 } 160 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 161 if (bucket->displayBucket_ != nullptr) { 162 bucket = bucket->displayBucket_; 163 } 164 return bucket->displayIndex_; 165 } 166 167 /** All of the buckets, visible and invisible. */ 168 UVector *bucketList_; 169 /** Just the visible buckets. */ 170 UVector *immutableVisibleList_; 171 }; 172 173 BucketList::~BucketList() { 174 delete bucketList_; 175 if (immutableVisibleList_ != bucketList_) { 176 delete immutableVisibleList_; 177 } 178 } 179 180 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 181 delete buckets_; 182 delete collatorPrimaryOnly_; 183 } 184 185 int32_t 186 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 187 return buckets_->getBucketCount(); 188 } 189 190 int32_t 191 AlphabeticIndex::ImmutableIndex::getBucketIndex( 192 const UnicodeString &name, UErrorCode &errorCode) const { 193 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 194 } 195 196 const AlphabeticIndex::Bucket * 197 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 198 if (0 <= index && index < buckets_->getBucketCount()) { 199 return icu::getBucket(*buckets_->immutableVisibleList_, index); 200 } else { 201 return nullptr; 202 } 203 } 204 205 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 206 : inputList_(nullptr), 207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(nullptr), 208 maxLabelCount_(99), 209 initialLabels_(nullptr), firstCharsInScripts_(nullptr), 210 collator_(nullptr), collatorPrimaryOnly_(nullptr), 211 buckets_(nullptr) { 212 init(&locale, status); 213 } 214 215 216 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 217 : inputList_(nullptr), 218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(nullptr), 219 maxLabelCount_(99), 220 initialLabels_(nullptr), firstCharsInScripts_(nullptr), 221 collator_(collator), collatorPrimaryOnly_(nullptr), 222 buckets_(nullptr) { 223 init(nullptr, status); 224 } 225 226 227 228 AlphabeticIndex::~AlphabeticIndex() { 229 delete collator_; 230 delete collatorPrimaryOnly_; 231 delete firstCharsInScripts_; 232 delete buckets_; 233 delete inputList_; 234 delete initialLabels_; 235 } 236 237 238 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 239 if (U_FAILURE(status)) { 240 return *this; 241 } 242 initialLabels_->addAll(additions); 243 clearBuckets(); 244 return *this; 245 } 246 247 248 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 249 addIndexExemplars(locale, status); 250 clearBuckets(); 251 return *this; 252 } 253 254 255 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 256 if (U_FAILURE(errorCode)) { return nullptr; } 257 // In C++, the ImmutableIndex must own its copy of the BucketList, 258 // even if it contains no records, for proper memory management. 259 // We could clone the buckets_ if they are not nullptr, 260 // but that would be worth it only if this method is called multiple times, 261 // or called after using the old-style bucket iterator API. 262 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 263 LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone()); 264 if (immutableBucketList.isNull() || coll.isNull()) { 265 errorCode = U_MEMORY_ALLOCATION_ERROR; 266 return nullptr; 267 } 268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 269 if (immIndex == nullptr) { 270 errorCode = U_MEMORY_ALLOCATION_ERROR; 271 return nullptr; 272 } 273 // The ImmutableIndex adopted its parameter objects. 274 immutableBucketList.orphan(); 275 coll.orphan(); 276 return immIndex; 277 } 278 279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 280 initBuckets(status); 281 if (U_FAILURE(status)) { 282 return 0; 283 } 284 return buckets_->getBucketCount(); 285 } 286 287 288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 289 if (U_FAILURE(status) || inputList_ == nullptr) { 290 return 0; 291 } 292 return inputList_->size(); 293 } 294 295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 296 U_ASSERT(indexCharacters.hasDeleter()); 297 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 298 if (U_FAILURE(errorCode)) { return; } 299 300 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 301 const UnicodeString &overflowBoundary = 302 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 303 304 // We make a sorted array of elements. 305 // Some of the input may be redundant. 306 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 307 // We filter out those cases. 308 UnicodeSetIterator iter(*initialLabels_); 309 while (U_SUCCESS(errorCode) && iter.next()) { 310 const UnicodeString *item = &iter.getString(); 311 LocalPointer<UnicodeString> ownedItem; 312 UBool checkDistinct; 313 int32_t itemLength = item->length(); 314 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 315 checkDistinct = false; 316 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 317 item->charAt(itemLength - 2) != 0x2a) { 318 // Use a label if it is marked with one trailing star, 319 // even if the label string sorts the same when all contractions are suppressed. 320 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 321 item = ownedItem.getAlias(); 322 if (item == nullptr) { 323 errorCode = U_MEMORY_ALLOCATION_ERROR; 324 return; 325 } 326 checkDistinct = false; 327 } else { 328 checkDistinct = true; 329 } 330 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 331 // Ignore a primary-ignorable or non-alphabetic index character. 332 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 333 // Ignore an index character that will land in the overflow bucket. 334 } else if (checkDistinct && 335 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 336 // Ignore a multi-code point index character that does not sort distinctly 337 // from the sequence of its separate characters. 338 } else { 339 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 340 if (insertionPoint < 0) { 341 indexCharacters.insertElementAt( 342 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 343 } else { 344 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 345 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 346 indexCharacters.setElementAt( 347 ownedString(*item, ownedItem, errorCode), insertionPoint); 348 } 349 } 350 } 351 } 352 if (U_FAILURE(errorCode)) { return; } 353 354 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element 355 356 int32_t size = indexCharacters.size() - 1; 357 if (size > maxLabelCount_) { 358 int32_t count = 0; 359 int32_t old = -1; 360 for (int32_t i = 0; i < indexCharacters.size();) { 361 ++count; 362 int32_t bump = count * maxLabelCount_ / size; 363 if (bump == old) { 364 indexCharacters.removeElementAt(i); 365 } else { 366 old = bump; 367 ++i; 368 } 369 } 370 } 371 } 372 373 namespace { 374 375 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 376 if (!current.startsWith(BASE, BASE_LENGTH)) { 377 return current; 378 } 379 char16_t rest = current.charAt(BASE_LENGTH); 380 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 381 int32_t count = rest-0x2800; 382 temp.setTo(static_cast<char16_t>(0x30 + count % 10)); 383 if (count >= 10) { 384 count /= 10; 385 temp.insert(0, static_cast<char16_t>(0x30 + count % 10)); 386 if (count >= 10) { 387 count /= 10; 388 temp.insert(0, static_cast<char16_t>(0x30 + count)); 389 } 390 } 391 return temp.append(static_cast<char16_t>(0x5283)); 392 } 393 return temp.setTo(current, BASE_LENGTH); 394 } 395 396 UBool hasMultiplePrimaryWeights( 397 const RuleBasedCollator &coll, uint32_t variableTop, 398 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { 399 ces.removeAllElements(); 400 coll.internalGetCEs(s, ces, errorCode); 401 if (U_FAILURE(errorCode)) { return false; } 402 UBool seenPrimary = false; 403 for (int32_t i = 0; i < ces.size(); ++i) { 404 int64_t ce = ces.elementAti(i); 405 uint32_t p = static_cast<uint32_t>(ce >> 32); 406 if (p > variableTop) { 407 // not primary ignorable 408 if (seenPrimary) { 409 return true; 410 } 411 seenPrimary = true; 412 } 413 } 414 return false; 415 } 416 417 } // namespace 418 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 420 // Initialize indexCharacters. 421 UVector indexCharacters(errorCode); 422 indexCharacters.setDeleter(uprv_deleteUObject); 423 initLabels(indexCharacters, errorCode); 424 if (U_FAILURE(errorCode)) { return nullptr; } 425 426 // Variables for hasMultiplePrimaryWeights(). 427 UVector64 ces(errorCode); 428 uint32_t variableTop; 429 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 430 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); 431 } else { 432 variableTop = 0; 433 } 434 UBool hasInvisibleBuckets = false; 435 436 // Helper arrays for Chinese Pinyin collation. 437 Bucket *asciiBuckets[26] = { 438 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 439 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr 440 }; 441 Bucket *pinyinBuckets[26] = { 442 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 443 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr 444 }; 445 UBool hasPinyin = false; 446 447 LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode); 448 if (U_FAILURE(errorCode)) { 449 return nullptr; 450 } 451 bucketList->setDeleter(uprv_deleteUObject); 452 453 // underflow bucket 454 LocalPointer<Bucket> bucket(new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW), errorCode); 455 if (U_FAILURE(errorCode)) { 456 return nullptr; 457 } 458 bucketList->adoptElement(bucket.orphan(), errorCode); 459 if (U_FAILURE(errorCode)) { return nullptr; } 460 461 UnicodeString temp; 462 463 // fix up the list, adding underflow, additions, overflow 464 // Insert inflow labels as needed. 465 int32_t scriptIndex = -1; 466 const UnicodeString *scriptUpperBoundary = &emptyString_; 467 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 468 UnicodeString ¤t = *getString(indexCharacters, i); 469 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 470 // We crossed the script boundary into a new script. 471 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 472 UBool skippedScript = false; 473 for (;;) { 474 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 475 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 476 break; 477 } 478 skippedScript = true; 479 } 480 if (skippedScript && bucketList->size() > 1) { 481 // We are skipping one or more scripts, 482 // and we are not just getting out of the underflow label. 483 bucket.adoptInsteadAndCheckErrorCode( 484 new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW), errorCode); 485 bucketList->adoptElement(bucket.orphan(), errorCode); 486 if (U_FAILURE(errorCode)) { return nullptr; } 487 } 488 } 489 // Add a bucket with the current label. 490 bucket.adoptInsteadAndCheckErrorCode( 491 new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL), errorCode); 492 bucketList->adoptElement(bucket.orphan(), errorCode); 493 if (U_FAILURE(errorCode)) { return nullptr; } 494 // Remember ASCII and Pinyin buckets for Pinyin redirects. 495 char16_t c; 496 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 497 asciiBuckets[c - 0x41] = static_cast<Bucket*>(bucketList->lastElement()); 498 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 499 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 500 pinyinBuckets[c - 0x41] = static_cast<Bucket*>(bucketList->lastElement()); 501 hasPinyin = true; 502 } 503 // Check for multiple primary weights. 504 if (!current.startsWith(BASE, BASE_LENGTH) && 505 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, 506 ces, errorCode) && 507 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 508 // "AE-ligature" or "Sch" etc. 509 for (int32_t j = bucketList->size() - 2;; --j) { 510 Bucket *singleBucket = getBucket(*bucketList, j); 511 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 512 // There is no single-character bucket since the last 513 // underflow or inflow label. 514 break; 515 } 516 if (singleBucket->displayBucket_ == nullptr && 517 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, 518 singleBucket->lowerBoundary_, 519 ces, errorCode)) { 520 // Add an invisible bucket that redirects strings greater than the expansion 521 // to the previous single-character bucket. 522 // For example, after ... Q R S Sch we add Sch\uFFFF->S 523 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 524 bucket.adoptInsteadAndCheckErrorCode(new Bucket(emptyString_, 525 UnicodeString(current).append(static_cast<char16_t>(0xFFFF)), 526 U_ALPHAINDEX_NORMAL), 527 errorCode); 528 if (U_FAILURE(errorCode)) { 529 return nullptr; 530 } 531 bucket->displayBucket_ = singleBucket; 532 bucketList->adoptElement(bucket.orphan(), errorCode); 533 if (U_FAILURE(errorCode)) { return nullptr; } 534 hasInvisibleBuckets = true; 535 break; 536 } 537 } 538 } 539 } 540 if (U_FAILURE(errorCode)) { return nullptr; } 541 if (bucketList->size() == 1) { 542 // No real labels, show only the underflow label. 543 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 544 if (bl == nullptr) { 545 errorCode = U_MEMORY_ALLOCATION_ERROR; 546 return nullptr; 547 } 548 bucketList.orphan(); 549 return bl; 550 } 551 // overflow bucket 552 bucket.adoptInsteadAndCheckErrorCode( 553 new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW), errorCode); 554 bucketList->adoptElement(bucket.orphan(), errorCode); // final 555 if (U_FAILURE(errorCode)) { return nullptr; } 556 557 if (hasPinyin) { 558 // Redirect Pinyin buckets. 559 Bucket *asciiBucket = nullptr; 560 for (int32_t i = 0; i < 26; ++i) { 561 if (asciiBuckets[i] != nullptr) { 562 asciiBucket = asciiBuckets[i]; 563 } 564 if (pinyinBuckets[i] != nullptr && asciiBucket != nullptr) { 565 pinyinBuckets[i]->displayBucket_ = asciiBucket; 566 hasInvisibleBuckets = true; 567 } 568 } 569 } 570 571 if (U_FAILURE(errorCode)) { return nullptr; } 572 if (!hasInvisibleBuckets) { 573 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 574 if (bl == nullptr) { 575 errorCode = U_MEMORY_ALLOCATION_ERROR; 576 return nullptr; 577 } 578 bucketList.orphan(); 579 return bl; 580 } 581 // Merge inflow buckets that are visually adjacent. 582 // Iterate backwards: Merge inflow into overflow rather than the other way around. 583 int32_t i = bucketList->size() - 1; 584 Bucket *nextBucket = getBucket(*bucketList, i); 585 while (--i > 0) { 586 Bucket *bucket = getBucket(*bucketList, i); 587 if (bucket->displayBucket_ != nullptr) { 588 continue; // skip invisible buckets 589 } 590 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 591 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 592 bucket->displayBucket_ = nextBucket; 593 continue; 594 } 595 } 596 nextBucket = bucket; 597 } 598 599 LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode); 600 if (U_FAILURE(errorCode)) { 601 return nullptr; 602 } 603 // Do not call publicBucketList->setDeleter(): 604 // This vector shares its objects with the bucketList. 605 for (int32_t j = 0; j < bucketList->size(); ++j) { 606 Bucket *bucket = getBucket(*bucketList, j); 607 if (bucket->displayBucket_ == nullptr) { 608 publicBucketList->addElement(bucket, errorCode); 609 } 610 } 611 if (U_FAILURE(errorCode)) { return nullptr; } 612 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 613 if (bl == nullptr) { 614 errorCode = U_MEMORY_ALLOCATION_ERROR; 615 return nullptr; 616 } 617 bucketList.orphan(); 618 publicBucketList.orphan(); 619 return bl; 620 } 621 622 /** 623 * Creates an index, and buckets and sorts the list of records into the index. 624 */ 625 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 626 if (U_FAILURE(errorCode) || buckets_ != nullptr) { 627 return; 628 } 629 buckets_ = createBucketList(errorCode); 630 if (U_FAILURE(errorCode) || inputList_ == nullptr || inputList_->isEmpty()) { 631 return; 632 } 633 634 // Sort the records by name. 635 // Stable sort preserves input order of collation duplicates. 636 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 637 638 // Now, we traverse all of the input, which is now sorted. 639 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 640 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 641 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 642 // we need to improve it for that case. 643 644 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 645 int32_t bucketIndex = 1; 646 Bucket *nextBucket; 647 const UnicodeString *upperBoundary; 648 if (bucketIndex < buckets_->bucketList_->size()) { 649 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 650 upperBoundary = &nextBucket->lowerBoundary_; 651 } else { 652 nextBucket = nullptr; 653 upperBoundary = nullptr; 654 } 655 for (int32_t i = 0; i < inputList_->size(); ++i) { 656 Record *r = getRecord(*inputList_, i); 657 // if the current bucket isn't the right one, find the one that is 658 // We have a special flag for the last bucket so that we don't look any further 659 while (upperBoundary != nullptr && 660 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 661 currentBucket = nextBucket; 662 // now reset the boundary that we compare against 663 if (bucketIndex < buckets_->bucketList_->size()) { 664 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 665 upperBoundary = &nextBucket->lowerBoundary_; 666 } else { 667 upperBoundary = nullptr; 668 } 669 } 670 // now put the record into the bucket. 671 Bucket *bucket = currentBucket; 672 if (bucket->displayBucket_ != nullptr) { 673 bucket = bucket->displayBucket_; 674 } 675 if (bucket->records_ == nullptr) { 676 LocalPointer<UVector> records(new UVector(errorCode), errorCode); 677 if (U_FAILURE(errorCode)) { 678 return; 679 } 680 bucket->records_ = records.orphan(); 681 } 682 bucket->records_->addElement(r, errorCode); 683 } 684 } 685 686 void AlphabeticIndex::clearBuckets() { 687 if (buckets_ != nullptr) { 688 delete buckets_; 689 buckets_ = nullptr; 690 internalResetBucketIterator(); 691 } 692 } 693 694 void AlphabeticIndex::internalResetBucketIterator() { 695 labelsIterIndex_ = -1; 696 currentBucket_ = nullptr; 697 } 698 699 700 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 701 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 702 if (U_FAILURE(status)) { 703 return; 704 } 705 706 UnicodeSet exemplars; 707 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 708 if (U_SUCCESS(status)) { 709 initialLabels_->addAll(exemplars); 710 return; 711 } 712 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 713 714 // The locale data did not include explicit Index characters. 715 // Synthesize a set of them from the locale's standard exemplar characters. 716 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 717 if (U_FAILURE(status)) { 718 return; 719 } 720 721 // question: should we add auxiliary exemplars? 722 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) { 723 exemplars.add(0x61, 0x7A); 724 } 725 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 726 // cut down to small list 727 exemplars.remove(0xAC00, 0xD7A3). 728 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 729 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 730 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 731 add(0xD30C).add(0xD558); 732 } 733 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 734 // cut down to small list 735 // make use of the fact that Ethiopic is allocated in 8's, where 736 // the base is 0 mod 8. 737 UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status); 738 ethiopic.retainAll(exemplars); 739 exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic); 740 } 741 742 // Upper-case any that aren't already so. 743 // (We only do this for synthesized index characters.) 744 UnicodeSetIterator it(exemplars); 745 UnicodeString upperC; 746 while (it.next()) { 747 const UnicodeString &exemplarC = it.getString(); 748 upperC = exemplarC; 749 upperC.toUpper(locale); 750 initialLabels_->add(upperC); 751 } 752 } 753 754 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 755 UnicodeSet contractions; 756 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); 757 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return false; } 758 initialLabels_->addAll(contractions); 759 UnicodeSetIterator iter(contractions); 760 while (iter.next()) { 761 const UnicodeString &s = iter.getString(); 762 U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); 763 char16_t c = s.charAt(s.length() - 1); 764 if (0x41 <= c && c <= 0x5A) { // A-Z 765 // There are Pinyin labels, add ASCII A-Z labels as well. 766 initialLabels_->add(0x41, 0x5A); // A-Z 767 break; 768 } 769 } 770 return true; 771 } 772 773 774 /* 775 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 776 */ 777 static const char16_t CGJ = 0x034F; 778 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 779 UnicodeString result; 780 if (item.length() == 0) { 781 return result; 782 } 783 int32_t i = 0; 784 for (;;) { 785 UChar32 cp = item.char32At(i); 786 result.append(cp); 787 i = item.moveIndex32(i, 1); 788 if (i >= item.length()) { 789 break; 790 } 791 result.append(CGJ); 792 } 793 return result; 794 } 795 796 797 bool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 798 return false; 799 } 800 801 802 bool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 803 return false; 804 } 805 806 807 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 808 return *collator_; 809 } 810 811 812 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 813 return inflowLabel_; 814 } 815 816 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 817 return overflowLabel_; 818 } 819 820 821 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 822 return underflowLabel_; 823 } 824 825 826 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 827 inflowLabel_ = label; 828 clearBuckets(); 829 return *this; 830 } 831 832 833 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 834 overflowLabel_ = label; 835 clearBuckets(); 836 return *this; 837 } 838 839 840 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 841 underflowLabel_ = label; 842 clearBuckets(); 843 return *this; 844 } 845 846 847 int32_t AlphabeticIndex::getMaxLabelCount() const { 848 return maxLabelCount_; 849 } 850 851 852 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 853 if (U_FAILURE(status)) { 854 return *this; 855 } 856 if (maxLabelCount <= 0) { 857 status = U_ILLEGAL_ARGUMENT_ERROR; 858 return *this; 859 } 860 maxLabelCount_ = maxLabelCount; 861 clearBuckets(); 862 return *this; 863 } 864 865 866 // 867 // init() - Common code for constructors. 868 // 869 870 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 871 if (U_FAILURE(status)) { return; } 872 if (locale == nullptr && collator_ == nullptr) { 873 status = U_ILLEGAL_ARGUMENT_ERROR; 874 return; 875 } 876 877 initialLabels_ = new UnicodeSet(); 878 if (initialLabels_ == nullptr) { 879 status = U_MEMORY_ALLOCATION_ERROR; 880 return; 881 } 882 883 inflowLabel_.setTo(static_cast<char16_t>(0x2026)); // Ellipsis 884 overflowLabel_ = inflowLabel_; 885 underflowLabel_ = inflowLabel_; 886 887 if (collator_ == nullptr) { 888 Collator *coll = Collator::createInstance(*locale, status); 889 if (U_FAILURE(status)) { 890 delete coll; 891 return; 892 } 893 if (coll == nullptr) { 894 status = U_MEMORY_ALLOCATION_ERROR; 895 return; 896 } 897 collator_ = dynamic_cast<RuleBasedCollator *>(coll); 898 if (collator_ == nullptr) { 899 delete coll; 900 status = U_UNSUPPORTED_ERROR; 901 return; 902 } 903 } 904 collatorPrimaryOnly_ = collator_->clone(); 905 if (collatorPrimaryOnly_ == nullptr) { 906 status = U_MEMORY_ALLOCATION_ERROR; 907 return; 908 } 909 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 910 firstCharsInScripts_ = firstStringsInScript(status); 911 if (U_FAILURE(status)) { return; } 912 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 913 // Guard against a degenerate collator where 914 // some script boundary strings are primary ignorable. 915 for (;;) { 916 if (U_FAILURE(status)) { return; } 917 if (firstCharsInScripts_->isEmpty()) { 918 // AlphabeticIndex requires some non-ignorable script boundary strings. 919 status = U_ILLEGAL_ARGUMENT_ERROR; 920 return; 921 } 922 if (collatorPrimaryOnly_->compare( 923 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 924 emptyString_, status) == UCOL_EQUAL) { 925 firstCharsInScripts_->removeElementAt(0); 926 } else { 927 break; 928 } 929 } 930 931 // Chinese index characters, which are specific to each of the several Chinese tailorings, 932 // take precedence over the single locale data exemplar set per language. 933 if (!addChineseIndexCharacters(status) && locale != nullptr) { 934 addIndexExemplars(*locale, status); 935 } 936 } 937 938 939 // 940 // Comparison function for UVector<UnicodeString *> sorting with a collator. 941 // 942 static int32_t U_CALLCONV 943 collatorComparator(const void *context, const void *left, const void *right) { 944 const UElement *leftElement = static_cast<const UElement *>(left); 945 const UElement *rightElement = static_cast<const UElement *>(right); 946 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 947 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 948 949 if (leftString == rightString) { 950 // Catches case where both are nullptr 951 return 0; 952 } 953 if (leftString == nullptr) { 954 return 1; 955 } 956 if (rightString == nullptr) { 957 return -1; 958 } 959 const Collator *col = static_cast<const Collator *>(context); 960 UErrorCode errorCode = U_ZERO_ERROR; 961 return col->compare(*leftString, *rightString, errorCode); 962 } 963 964 // 965 // Comparison function for UVector<Record *> sorting with a collator. 966 // 967 static int32_t U_CALLCONV 968 recordCompareFn(const void *context, const void *left, const void *right) { 969 const UElement *leftElement = static_cast<const UElement *>(left); 970 const UElement *rightElement = static_cast<const UElement *>(right); 971 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 972 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 973 const Collator *col = static_cast<const Collator *>(context); 974 UErrorCode errorCode = U_ZERO_ERROR; 975 return col->compare(leftRec->name_, rightRec->name_, errorCode); 976 } 977 978 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 979 if (U_FAILURE(status)) { 980 return nullptr; 981 } 982 LocalPointer<UVector> dest(new UVector(status), status); 983 if (U_FAILURE(status)) { 984 return nullptr; 985 } 986 dest->setDeleter(uprv_deleteUObject); 987 // Fetch the script-first-primary contractions which are defined in the root collator. 988 // They all start with U+FDD1. 989 UnicodeSet set; 990 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); 991 if (U_FAILURE(status)) { 992 return nullptr; 993 } 994 if (set.isEmpty()) { 995 status = U_UNSUPPORTED_ERROR; 996 return nullptr; 997 } 998 UnicodeSetIterator iter(set); 999 while (iter.next()) { 1000 const UnicodeString &boundary = iter.getString(); 1001 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); 1002 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { 1003 // Ignore boundaries for the special reordering groups. 1004 // Take only those for "real scripts" (where the sample character is a Letter, 1005 // and the one for unassigned implicit weights (Cn). 1006 continue; 1007 } 1008 LocalPointer<UnicodeString> s(new UnicodeString(boundary), status); 1009 dest->adoptElement(s.orphan(), status); 1010 if (U_FAILURE(status)) { 1011 return nullptr; 1012 } 1013 } 1014 return dest.orphan(); 1015 } 1016 1017 1018 namespace { 1019 1020 /** 1021 * Returns true if one index character string is "better" than the other. 1022 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1023 * better, and otherwise binary-less-than is better. 1024 */ 1025 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1026 const UnicodeString &one, const UnicodeString &other) { 1027 // This is called with primary-equal strings, but never with one.equals(other). 1028 UErrorCode status = U_ZERO_ERROR; 1029 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1030 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1031 if (U_FAILURE(status)) { return false; } 1032 int32_t result = n1.countChar32() - n2.countChar32(); 1033 if (result != 0) { 1034 return result < 0; 1035 } 1036 result = n1.compareCodePointOrder(n2); 1037 if (result != 0) { 1038 return result < 0; 1039 } 1040 return one.compareCodePointOrder(other) < 0; 1041 } 1042 1043 } // namespace 1044 1045 // 1046 // Constructor & Destructor for AlphabeticIndex::Record 1047 // 1048 // Records are internal only, instances are not directly surfaced in the public API. 1049 // This class is mostly struct-like, with all public fields. 1050 1051 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1052 : name_(name), data_(data) {} 1053 1054 AlphabeticIndex::Record::~Record() { 1055 } 1056 1057 1058 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1059 if (U_FAILURE(status)) { 1060 return *this; 1061 } 1062 if (inputList_ == nullptr) { 1063 LocalPointer<UVector> inputList(new UVector(status), status); 1064 if (U_FAILURE(status)) { 1065 return *this; 1066 } 1067 inputList_ = inputList.orphan(); 1068 inputList_->setDeleter(alphaIndex_deleteRecord); 1069 } 1070 LocalPointer<Record> r(new Record(name, data), status); 1071 inputList_->adoptElement(r.orphan(), status); 1072 if (U_FAILURE(status)) { 1073 return *this; 1074 } 1075 clearBuckets(); 1076 //std::string ss; 1077 //std::string ss2; 1078 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1079 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1080 return *this; 1081 } 1082 1083 1084 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1085 if (U_SUCCESS(status) && inputList_ != nullptr && !inputList_->isEmpty()) { 1086 inputList_->removeAllElements(); 1087 clearBuckets(); 1088 } 1089 return *this; 1090 } 1091 1092 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1093 initBuckets(status); 1094 if (U_FAILURE(status)) { 1095 return 0; 1096 } 1097 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1098 } 1099 1100 1101 int32_t AlphabeticIndex::getBucketIndex() const { 1102 return labelsIterIndex_; 1103 } 1104 1105 1106 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1107 if (U_FAILURE(status)) { 1108 return false; 1109 } 1110 if (buckets_ == nullptr && currentBucket_ != nullptr) { 1111 status = U_ENUM_OUT_OF_SYNC_ERROR; 1112 return false; 1113 } 1114 initBuckets(status); 1115 if (U_FAILURE(status)) { 1116 return false; 1117 } 1118 ++labelsIterIndex_; 1119 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1120 labelsIterIndex_ = buckets_->getBucketCount(); 1121 return false; 1122 } 1123 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1124 resetRecordIterator(); 1125 return true; 1126 } 1127 1128 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1129 if (currentBucket_ != nullptr) { 1130 return currentBucket_->label_; 1131 } else { 1132 return emptyString_; 1133 } 1134 } 1135 1136 1137 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1138 if (currentBucket_ != nullptr) { 1139 return currentBucket_->labelType_; 1140 } else { 1141 return U_ALPHAINDEX_NORMAL; 1142 } 1143 } 1144 1145 1146 int32_t AlphabeticIndex::getBucketRecordCount() const { 1147 if (currentBucket_ != nullptr && currentBucket_->records_ != nullptr) { 1148 return currentBucket_->records_->size(); 1149 } else { 1150 return 0; 1151 } 1152 } 1153 1154 1155 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1156 if (U_FAILURE(status)) { 1157 return *this; 1158 } 1159 internalResetBucketIterator(); 1160 return *this; 1161 } 1162 1163 1164 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1165 if (U_FAILURE(status)) { 1166 return false; 1167 } 1168 if (currentBucket_ == nullptr) { 1169 // We are trying to iterate over the items in a bucket, but there is no 1170 // current bucket from the enumeration of buckets. 1171 status = U_INVALID_STATE_ERROR; 1172 return false; 1173 } 1174 if (buckets_ == nullptr) { 1175 status = U_ENUM_OUT_OF_SYNC_ERROR; 1176 return false; 1177 } 1178 if (currentBucket_->records_ == nullptr) { 1179 return false; 1180 } 1181 ++itemsIterIndex_; 1182 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1183 itemsIterIndex_ = currentBucket_->records_->size(); 1184 return false; 1185 } 1186 return true; 1187 } 1188 1189 1190 const UnicodeString &AlphabeticIndex::getRecordName() const { 1191 const UnicodeString *retStr = &emptyString_; 1192 if (currentBucket_ != nullptr && currentBucket_->records_ != nullptr && 1193 itemsIterIndex_ >= 0 && 1194 itemsIterIndex_ < currentBucket_->records_->size()) { 1195 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1196 retStr = &item->name_; 1197 } 1198 return *retStr; 1199 } 1200 1201 const void *AlphabeticIndex::getRecordData() const { 1202 const void *retPtr = nullptr; 1203 if (currentBucket_ != nullptr && currentBucket_->records_ != nullptr && 1204 itemsIterIndex_ >= 0 && 1205 itemsIterIndex_ < currentBucket_->records_->size()) { 1206 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1207 retPtr = item->data_; 1208 } 1209 return retPtr; 1210 } 1211 1212 1213 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1214 itemsIterIndex_ = -1; 1215 return *this; 1216 } 1217 1218 1219 1220 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1221 const UnicodeString &lowerBoundary, 1222 UAlphabeticIndexLabelType type) 1223 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1224 displayBucket_(nullptr), displayIndex_(-1), 1225 records_(nullptr) { 1226 } 1227 1228 1229 AlphabeticIndex::Bucket::~Bucket() { 1230 delete records_; 1231 } 1232 1233 U_NAMESPACE_END 1234 1235 #endif // !UCONFIG_NO_COLLATION