coleitr.cpp (15790B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 /* 11 * File coleitr.cpp 12 * 13 * Created by: Helena Shih 14 * 15 * Modification History: 16 * 17 * Date Name Description 18 * 19 * 6/23/97 helena Adding comments to make code more readable. 20 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java 21 * 12/10/99 aliu Ported Thai collation support from Java. 22 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) 23 * 02/19/01 swquek Removed CollationElementIterator() since it is 24 * private constructor and no calls are made to it 25 * 2012-2014 markus Rewritten in C++ again. 26 */ 27 28 #include "unicode/utypes.h" 29 30 #if !UCONFIG_NO_COLLATION 31 32 #include "unicode/chariter.h" 33 #include "unicode/coleitr.h" 34 #include "unicode/tblcoll.h" 35 #include "unicode/ustring.h" 36 #include "cmemory.h" 37 #include "collation.h" 38 #include "collationdata.h" 39 #include "collationiterator.h" 40 #include "collationsets.h" 41 #include "collationtailoring.h" 42 #include "uassert.h" 43 #include "uhash.h" 44 #include "utf16collationiterator.h" 45 #include "uvectr32.h" 46 47 /* Constants --------------------------------------------------------------- */ 48 49 U_NAMESPACE_BEGIN 50 51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) 52 53 /* CollationElementIterator public constructor/destructor ------------------ */ 54 55 CollationElementIterator::CollationElementIterator( 56 const CollationElementIterator& other) 57 : UObject(other), iter_(nullptr), rbc_(nullptr), otherHalf_(0), dir_(0), offsets_(nullptr) { 58 *this = other; 59 } 60 61 CollationElementIterator::~CollationElementIterator() 62 { 63 delete iter_; 64 delete offsets_; 65 } 66 67 /* CollationElementIterator public methods --------------------------------- */ 68 69 namespace { 70 71 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { 72 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); 73 } 74 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { 75 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); 76 } 77 UBool ceNeedsTwoParts(int64_t ce) { 78 return (ce & INT64_C(0xffff00ff003f)) != 0; 79 } 80 81 } // namespace 82 83 int32_t CollationElementIterator::getOffset() const 84 { 85 if (dir_ < 0 && offsets_ != nullptr && !offsets_->isEmpty()) { 86 // CollationIterator::previousCE() decrements the CEs length 87 // while it pops CEs from its internal buffer. 88 int32_t i = iter_->getCEsLength(); 89 if (otherHalf_ != 0) { 90 // Return the trailing CE offset while we are in the middle of a 64-bit CE. 91 ++i; 92 } 93 U_ASSERT(i < offsets_->size()); 94 return offsets_->elementAti(i); 95 } 96 return iter_->getOffset(); 97 } 98 99 /** 100 * Get the ordering priority of the next character in the string. 101 * @return the next character's ordering. Returns NULLORDER if an error has 102 * occurred or if the end of string has been reached 103 */ 104 int32_t CollationElementIterator::next(UErrorCode& status) 105 { 106 if (U_FAILURE(status)) { return NULLORDER; } 107 if (dir_ > 1) { 108 // Continue forward iteration. Test this first. 109 if (otherHalf_ != 0) { 110 uint32_t oh = otherHalf_; 111 otherHalf_ = 0; 112 return oh; 113 } 114 } else if (dir_ == 1) { 115 // next() after setOffset() 116 dir_ = 2; 117 } else if (dir_ == 0) { 118 // The iter_ is already reset to the start of the text. 119 dir_ = 2; 120 } else /* dir_ < 0 */ { 121 // illegal change of direction 122 status = U_INVALID_STATE_ERROR; 123 return NULLORDER; 124 } 125 // No need to keep all CEs in the buffer when we iterate. 126 iter_->clearCEsIfNoneRemaining(); 127 int64_t ce = iter_->nextCE(status); 128 if (ce == Collation::NO_CE) { return NULLORDER; } 129 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. 130 uint32_t p = static_cast<uint32_t>(ce >> 32); 131 uint32_t lower32 = static_cast<uint32_t>(ce); 132 uint32_t firstHalf = getFirstHalf(p, lower32); 133 uint32_t secondHalf = getSecondHalf(p, lower32); 134 if (secondHalf != 0) { 135 otherHalf_ = secondHalf | 0xc0; // continuation CE 136 } 137 return firstHalf; 138 } 139 140 bool CollationElementIterator::operator!=( 141 const CollationElementIterator& other) const 142 { 143 return !(*this == other); 144 } 145 146 bool CollationElementIterator::operator==( 147 const CollationElementIterator& that) const 148 { 149 if (this == &that) { 150 return true; 151 } 152 153 return 154 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && 155 otherHalf_ == that.otherHalf_ && 156 normalizeDir() == that.normalizeDir() && 157 string_ == that.string_ && 158 *iter_ == *that.iter_; 159 } 160 161 /** 162 * Get the ordering priority of the previous collation element in the string. 163 * @param status the error code status. 164 * @return the previous element's ordering. Returns NULLORDER if an error has 165 * occurred or if the start of string has been reached. 166 */ 167 int32_t CollationElementIterator::previous(UErrorCode& status) 168 { 169 if (U_FAILURE(status)) { return NULLORDER; } 170 if (dir_ < 0) { 171 // Continue backwards iteration. Test this first. 172 if (otherHalf_ != 0) { 173 uint32_t oh = otherHalf_; 174 otherHalf_ = 0; 175 return oh; 176 } 177 } else if (dir_ == 0) { 178 iter_->resetToOffset(string_.length()); 179 dir_ = -1; 180 } else if (dir_ == 1) { 181 // previous() after setOffset() 182 dir_ = -1; 183 } else /* dir_ > 1 */ { 184 // illegal change of direction 185 status = U_INVALID_STATE_ERROR; 186 return NULLORDER; 187 } 188 if (offsets_ == nullptr) { 189 offsets_ = new UVector32(status); 190 if (offsets_ == nullptr) { 191 status = U_MEMORY_ALLOCATION_ERROR; 192 return NULLORDER; 193 } 194 } 195 // If we already have expansion CEs, then we also have offsets. 196 // Otherwise remember the trailing offset in case we need to 197 // write offsets for an artificial expansion. 198 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; 199 int64_t ce = iter_->previousCE(*offsets_, status); 200 if (ce == Collation::NO_CE) { return NULLORDER; } 201 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. 202 uint32_t p = static_cast<uint32_t>(ce >> 32); 203 uint32_t lower32 = static_cast<uint32_t>(ce); 204 uint32_t firstHalf = getFirstHalf(p, lower32); 205 uint32_t secondHalf = getSecondHalf(p, lower32); 206 if (secondHalf != 0) { 207 if (offsets_->isEmpty()) { 208 // When we convert a single 64-bit CE into two 32-bit CEs, 209 // we need to make this artificial expansion behave like a normal expansion. 210 // See CollationIterator::previousCE(). 211 offsets_->addElement(iter_->getOffset(), status); 212 offsets_->addElement(limitOffset, status); 213 } 214 otherHalf_ = firstHalf; 215 return secondHalf | 0xc0; // continuation CE 216 } 217 return firstHalf; 218 } 219 220 /** 221 * Resets the cursor to the beginning of the string. 222 */ 223 void CollationElementIterator::reset() 224 { 225 iter_ ->resetToOffset(0); 226 otherHalf_ = 0; 227 dir_ = 0; 228 } 229 230 void CollationElementIterator::setOffset(int32_t newOffset, 231 UErrorCode& status) 232 { 233 if (U_FAILURE(status)) { return; } 234 if (0 < newOffset && newOffset < string_.length()) { 235 int32_t offset = newOffset; 236 do { 237 char16_t c = string_.charAt(offset); 238 if (!rbc_->isUnsafe(c) || 239 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { 240 break; 241 } 242 // Back up to before this unsafe character. 243 --offset; 244 } while (offset > 0); 245 if (offset < newOffset) { 246 // We might have backed up more than necessary. 247 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, 248 // but for text "chu" setOffset(2) should remain at 2 249 // although we initially back up to offset 0. 250 // Find the last safe offset no greater than newOffset by iterating forward. 251 int32_t lastSafeOffset = offset; 252 do { 253 iter_->resetToOffset(lastSafeOffset); 254 do { 255 iter_->nextCE(status); 256 if (U_FAILURE(status)) { return; } 257 } while ((offset = iter_->getOffset()) == lastSafeOffset); 258 if (offset <= newOffset) { 259 lastSafeOffset = offset; 260 } 261 } while (offset < newOffset); 262 newOffset = lastSafeOffset; 263 } 264 } 265 iter_->resetToOffset(newOffset); 266 otherHalf_ = 0; 267 dir_ = 1; 268 } 269 270 /** 271 * Sets the source to the new source string. 272 */ 273 void CollationElementIterator::setText(const UnicodeString& source, 274 UErrorCode& status) 275 { 276 if (U_FAILURE(status)) { 277 return; 278 } 279 280 string_ = source; 281 const char16_t *s = string_.getBuffer(); 282 CollationIterator *newIter; 283 UBool numeric = rbc_->settings->isNumeric(); 284 if (rbc_->settings->dontCheckFCD()) { 285 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); 286 } else { 287 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); 288 } 289 if (newIter == nullptr) { 290 status = U_MEMORY_ALLOCATION_ERROR; 291 return; 292 } 293 delete iter_; 294 iter_ = newIter; 295 otherHalf_ = 0; 296 dir_ = 0; 297 } 298 299 // Sets the source to the new character iterator. 300 void CollationElementIterator::setText(CharacterIterator& source, 301 UErrorCode& status) 302 { 303 if (U_FAILURE(status)) 304 return; 305 306 source.getText(string_); 307 setText(string_, status); 308 } 309 310 int32_t CollationElementIterator::strengthOrder(int32_t order) const 311 { 312 UColAttributeValue s = static_cast<UColAttributeValue>(rbc_->settings->getStrength()); 313 // Mask off the unwanted differences. 314 if (s == UCOL_PRIMARY) { 315 order &= 0xffff0000; 316 } 317 else if (s == UCOL_SECONDARY) { 318 order &= 0xffffff00; 319 } 320 321 return order; 322 } 323 324 /* CollationElementIterator private constructors/destructors --------------- */ 325 326 /** 327 * This is the "real" constructor for this class; it constructs an iterator 328 * over the source text using the specified collator 329 */ 330 CollationElementIterator::CollationElementIterator( 331 const UnicodeString &source, 332 const RuleBasedCollator *coll, 333 UErrorCode &status) 334 : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) { 335 setText(source, status); 336 } 337 338 /** 339 * This is the "real" constructor for this class; it constructs an iterator over 340 * the source text using the specified collator 341 */ 342 CollationElementIterator::CollationElementIterator( 343 const CharacterIterator &source, 344 const RuleBasedCollator *coll, 345 UErrorCode &status) 346 : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) { 347 // We only call source.getText() which should be const anyway. 348 setText(const_cast<CharacterIterator &>(source), status); 349 } 350 351 /* CollationElementIterator private methods -------------------------------- */ 352 353 const CollationElementIterator& CollationElementIterator::operator=( 354 const CollationElementIterator& other) 355 { 356 if (this == &other) { 357 return *this; 358 } 359 360 CollationIterator *newIter; 361 const FCDUTF16CollationIterator *otherFCDIter = 362 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); 363 if(otherFCDIter != nullptr) { 364 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); 365 } else { 366 const UTF16CollationIterator *otherIter = 367 dynamic_cast<const UTF16CollationIterator *>(other.iter_); 368 if(otherIter != nullptr) { 369 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); 370 } else { 371 newIter = nullptr; 372 } 373 } 374 if(newIter != nullptr) { 375 delete iter_; 376 iter_ = newIter; 377 rbc_ = other.rbc_; 378 otherHalf_ = other.otherHalf_; 379 dir_ = other.dir_; 380 381 string_ = other.string_; 382 } 383 if(other.dir_ < 0 && other.offsets_ != nullptr && !other.offsets_->isEmpty()) { 384 UErrorCode errorCode = U_ZERO_ERROR; 385 if(offsets_ == nullptr) { 386 offsets_ = new UVector32(other.offsets_->size(), errorCode); 387 } 388 if(offsets_ != nullptr) { 389 offsets_->assign(*other.offsets_, errorCode); 390 } 391 } 392 return *this; 393 } 394 395 namespace { 396 397 class MaxExpSink : public ContractionsAndExpansions::CESink { 398 public: 399 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} 400 virtual ~MaxExpSink(); 401 virtual void handleCE(int64_t /*ce*/) override {} 402 virtual void handleExpansion(const int64_t ces[], int32_t length) override { 403 if (length <= 1) { 404 // We do not need to add single CEs into the map. 405 return; 406 } 407 int32_t count = 0; // number of CE "halves" 408 for (int32_t i = 0; i < length; ++i) { 409 count += ceNeedsTwoParts(ces[i]) ? 2 : 1; 410 } 411 // last "half" of the last CE 412 int64_t ce = ces[length - 1]; 413 uint32_t p = static_cast<uint32_t>(ce >> 32); 414 uint32_t lower32 = static_cast<uint32_t>(ce); 415 uint32_t lastHalf = getSecondHalf(p, lower32); 416 if (lastHalf == 0) { 417 lastHalf = getFirstHalf(p, lower32); 418 U_ASSERT(lastHalf != 0); 419 } else { 420 lastHalf |= 0xc0; // old-style continuation CE 421 } 422 if (count > uhash_igeti(maxExpansions, static_cast<int32_t>(lastHalf))) { 423 uhash_iputi(maxExpansions, static_cast<int32_t>(lastHalf), count, &errorCode); 424 } 425 } 426 427 private: 428 UHashtable *maxExpansions; 429 UErrorCode &errorCode; 430 }; 431 432 MaxExpSink::~MaxExpSink() {} 433 434 } // namespace 435 436 UHashtable * 437 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { 438 if (U_FAILURE(errorCode)) { return nullptr; } 439 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, 440 uhash_compareLong, &errorCode); 441 if (U_FAILURE(errorCode)) { return nullptr; } 442 MaxExpSink sink(maxExpansions, errorCode); 443 ContractionsAndExpansions(nullptr, nullptr, &sink, true).forData(data, errorCode); 444 if (U_FAILURE(errorCode)) { 445 uhash_close(maxExpansions); 446 return nullptr; 447 } 448 return maxExpansions; 449 } 450 451 int32_t 452 CollationElementIterator::getMaxExpansion(int32_t order) const { 453 return getMaxExpansion(rbc_->tailoring->maxExpansions, order); 454 } 455 456 int32_t 457 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { 458 if (order == 0) { return 1; } 459 int32_t max; 460 if(maxExpansions != nullptr && (max = uhash_igeti(maxExpansions, order)) != 0) { 461 return max; 462 } 463 if ((order & 0xc0) == 0xc0) { 464 // old-style continuation CE 465 return 2; 466 } else { 467 return 1; 468 } 469 } 470 471 U_NAMESPACE_END 472 473 #endif /* #if !UCONFIG_NO_COLLATION */