normlzr.cpp (15145B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ************************************************************************* 5 * COPYRIGHT: 6 * Copyright (c) 1996-2012, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ************************************************************************* 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_NORMALIZATION 14 15 #include "unicode/uniset.h" 16 #include "unicode/unistr.h" 17 #include "unicode/chariter.h" 18 #include "unicode/schriter.h" 19 #include "unicode/uchriter.h" 20 #include "unicode/normlzr.h" 21 #include "unicode/utf16.h" 22 #include "cmemory.h" 23 #include "normalizer2impl.h" 24 #include "uprops.h" // for uniset_getUnicode32Instance() 25 26 #if defined(move32) 27 // System can define move32 intrinsics, but the char iters define move32 method 28 // using same undef trick in headers, so undef here to re-enable the method. 29 #undef move32 30 #endif 31 32 U_NAMESPACE_BEGIN 33 34 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) 35 36 //------------------------------------------------------------------------- 37 // Constructors and other boilerplate 38 //------------------------------------------------------------------------- 39 40 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : 41 UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0), 42 text(new StringCharacterIterator(str)), 43 currentIndex(0), nextIndex(0), 44 buffer(), bufferPos(0) 45 { 46 init(); 47 } 48 49 Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) : 50 UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0), 51 text(new UCharCharacterIterator(str, length)), 52 currentIndex(0), nextIndex(0), 53 buffer(), bufferPos(0) 54 { 55 init(); 56 } 57 58 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : 59 UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0), 60 text(iter.clone()), 61 currentIndex(0), nextIndex(0), 62 buffer(), bufferPos(0) 63 { 64 init(); 65 } 66 67 Normalizer::Normalizer(const Normalizer ©) : 68 UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions), 69 text(copy.text->clone()), 70 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), 71 buffer(copy.buffer), bufferPos(copy.bufferPos) 72 { 73 init(); 74 } 75 76 void 77 Normalizer::init() { 78 UErrorCode errorCode=U_ZERO_ERROR; 79 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); 80 if(fOptions&UNORM_UNICODE_3_2) { 81 delete fFilteredNorm2; 82 fNorm2=fFilteredNorm2= 83 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); 84 } 85 if(U_FAILURE(errorCode)) { 86 errorCode=U_ZERO_ERROR; 87 fNorm2=Normalizer2Factory::getNoopInstance(errorCode); 88 } 89 } 90 91 Normalizer::~Normalizer() 92 { 93 delete fFilteredNorm2; 94 delete text; 95 } 96 97 Normalizer* 98 Normalizer::clone() const 99 { 100 return new Normalizer(*this); 101 } 102 103 /** 104 * Generates a hash code for this iterator. 105 */ 106 int32_t Normalizer::hashCode() const 107 { 108 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; 109 } 110 111 bool Normalizer::operator==(const Normalizer& that) const 112 { 113 return 114 this==&that || 115 (fUMode==that.fUMode && 116 fOptions==that.fOptions && 117 *text==*that.text && 118 buffer==that.buffer && 119 bufferPos==that.bufferPos && 120 nextIndex==that.nextIndex); 121 } 122 123 //------------------------------------------------------------------------- 124 // Static utility methods 125 //------------------------------------------------------------------------- 126 127 void U_EXPORT2 128 Normalizer::normalize(const UnicodeString& source, 129 UNormalizationMode mode, int32_t options, 130 UnicodeString& result, 131 UErrorCode &status) { 132 if(source.isBogus() || U_FAILURE(status)) { 133 result.setToBogus(); 134 if(U_SUCCESS(status)) { 135 status=U_ILLEGAL_ARGUMENT_ERROR; 136 } 137 } else { 138 UnicodeString localDest; 139 UnicodeString *dest; 140 141 if(&source!=&result) { 142 dest=&result; 143 } else { 144 // the source and result strings are the same object, use a temporary one 145 dest=&localDest; 146 } 147 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 148 if(U_SUCCESS(status)) { 149 if(options&UNORM_UNICODE_3_2) { 150 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 151 normalize(source, *dest, status); 152 } else { 153 n2->normalize(source, *dest, status); 154 } 155 } 156 if(dest==&localDest && U_SUCCESS(status)) { 157 result=*dest; 158 } 159 } 160 } 161 162 void U_EXPORT2 163 Normalizer::compose(const UnicodeString& source, 164 UBool compat, int32_t options, 165 UnicodeString& result, 166 UErrorCode &status) { 167 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); 168 } 169 170 void U_EXPORT2 171 Normalizer::decompose(const UnicodeString& source, 172 UBool compat, int32_t options, 173 UnicodeString& result, 174 UErrorCode &status) { 175 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); 176 } 177 178 UNormalizationCheckResult 179 Normalizer::quickCheck(const UnicodeString& source, 180 UNormalizationMode mode, int32_t options, 181 UErrorCode &status) { 182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 183 if(U_SUCCESS(status)) { 184 if(options&UNORM_UNICODE_3_2) { 185 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 186 quickCheck(source, status); 187 } else { 188 return n2->quickCheck(source, status); 189 } 190 } else { 191 return UNORM_MAYBE; 192 } 193 } 194 195 UBool 196 Normalizer::isNormalized(const UnicodeString& source, 197 UNormalizationMode mode, int32_t options, 198 UErrorCode &status) { 199 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 200 if(U_SUCCESS(status)) { 201 if(options&UNORM_UNICODE_3_2) { 202 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 203 isNormalized(source, status); 204 } else { 205 return n2->isNormalized(source, status); 206 } 207 } else { 208 return false; 209 } 210 } 211 212 UnicodeString & U_EXPORT2 213 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, 214 UnicodeString &result, 215 UNormalizationMode mode, int32_t options, 216 UErrorCode &errorCode) { 217 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { 218 result.setToBogus(); 219 if(U_SUCCESS(errorCode)) { 220 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 221 } 222 } else { 223 UnicodeString localDest; 224 UnicodeString *dest; 225 226 if(&right!=&result) { 227 dest=&result; 228 } else { 229 // the right and result strings are the same object, use a temporary one 230 dest=&localDest; 231 } 232 *dest=left; 233 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); 234 if(U_SUCCESS(errorCode)) { 235 if(options&UNORM_UNICODE_3_2) { 236 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). 237 append(*dest, right, errorCode); 238 } else { 239 n2->append(*dest, right, errorCode); 240 } 241 } 242 if(dest==&localDest && U_SUCCESS(errorCode)) { 243 result=*dest; 244 } 245 } 246 return result; 247 } 248 249 //------------------------------------------------------------------------- 250 // Iteration API 251 //------------------------------------------------------------------------- 252 253 /** 254 * Return the current character in the normalized text. 255 */ 256 UChar32 Normalizer::current() { 257 if(bufferPos<buffer.length() || nextNormalize()) { 258 return buffer.char32At(bufferPos); 259 } else { 260 return DONE; 261 } 262 } 263 264 /** 265 * Return the next character in the normalized text and advance 266 * the iteration position by one. If the end 267 * of the text has already been reached, {@link #DONE} is returned. 268 */ 269 UChar32 Normalizer::next() { 270 if(bufferPos<buffer.length() || nextNormalize()) { 271 UChar32 c=buffer.char32At(bufferPos); 272 bufferPos+=U16_LENGTH(c); 273 return c; 274 } else { 275 return DONE; 276 } 277 } 278 279 /** 280 * Return the previous character in the normalized text and decrement 281 * the iteration position by one. If the beginning 282 * of the text has already been reached, {@link #DONE} is returned. 283 */ 284 UChar32 Normalizer::previous() { 285 if(bufferPos>0 || previousNormalize()) { 286 UChar32 c=buffer.char32At(bufferPos-1); 287 bufferPos-=U16_LENGTH(c); 288 return c; 289 } else { 290 return DONE; 291 } 292 } 293 294 void Normalizer::reset() { 295 currentIndex=nextIndex=text->setToStart(); 296 clearBuffer(); 297 } 298 299 void 300 Normalizer::setIndexOnly(int32_t index) { 301 text->setIndex(index); // pins index 302 currentIndex=nextIndex=text->getIndex(); 303 clearBuffer(); 304 } 305 306 /** 307 * Return the first character in the normalized text. This resets 308 * the <tt>Normalizer's</tt> position to the beginning of the text. 309 */ 310 UChar32 Normalizer::first() { 311 reset(); 312 return next(); 313 } 314 315 /** 316 * Return the last character in the normalized text. This resets 317 * the <tt>Normalizer's</tt> position to be just before the 318 * the input text corresponding to that normalized character. 319 */ 320 UChar32 Normalizer::last() { 321 currentIndex=nextIndex=text->setToEnd(); 322 clearBuffer(); 323 return previous(); 324 } 325 326 /** 327 * Retrieve the current iteration position in the input text that is 328 * being normalized. This method is useful in applications such as 329 * searching, where you need to be able to determine the position in 330 * the input text that corresponds to a given normalized output character. 331 * <p> 332 * <b>Note:</b> This method sets the position in the <em>input</em>, while 333 * {@link #next} and {@link #previous} iterate through characters in the 334 * <em>output</em>. This means that there is not necessarily a one-to-one 335 * correspondence between characters returned by <tt>next</tt> and 336 * <tt>previous</tt> and the indices passed to and returned from 337 * <tt>setIndex</tt> and {@link #getIndex}. 338 * 339 */ 340 int32_t Normalizer::getIndex() const { 341 if(bufferPos<buffer.length()) { 342 return currentIndex; 343 } else { 344 return nextIndex; 345 } 346 } 347 348 /** 349 * Retrieve the index of the start of the input text. This is the begin index 350 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> 351 * over which this <tt>Normalizer</tt> is iterating 352 */ 353 int32_t Normalizer::startIndex() const { 354 return text->startIndex(); 355 } 356 357 /** 358 * Retrieve the index of the end of the input text. This is the end index 359 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 360 * over which this <tt>Normalizer</tt> is iterating 361 */ 362 int32_t Normalizer::endIndex() const { 363 return text->endIndex(); 364 } 365 366 //------------------------------------------------------------------------- 367 // Property access methods 368 //------------------------------------------------------------------------- 369 370 void 371 Normalizer::setMode(UNormalizationMode newMode) 372 { 373 fUMode = newMode; 374 init(); 375 } 376 377 UNormalizationMode 378 Normalizer::getUMode() const 379 { 380 return fUMode; 381 } 382 383 void 384 Normalizer::setOption(int32_t option, 385 UBool value) 386 { 387 if (value) { 388 fOptions |= option; 389 } else { 390 fOptions &= (~option); 391 } 392 init(); 393 } 394 395 UBool 396 Normalizer::getOption(int32_t option) const 397 { 398 return (fOptions & option) != 0; 399 } 400 401 /** 402 * Set the input text over which this <tt>Normalizer</tt> will iterate. 403 * The iteration position is set to the beginning of the input text. 404 */ 405 void 406 Normalizer::setText(const UnicodeString& newText, 407 UErrorCode &status) 408 { 409 if (U_FAILURE(status)) { 410 return; 411 } 412 CharacterIterator *newIter = new StringCharacterIterator(newText); 413 if (newIter == nullptr) { 414 status = U_MEMORY_ALLOCATION_ERROR; 415 return; 416 } 417 delete text; 418 text = newIter; 419 reset(); 420 } 421 422 /** 423 * Set the input text over which this <tt>Normalizer</tt> will iterate. 424 * The iteration position is set to the beginning of the string. 425 */ 426 void 427 Normalizer::setText(const CharacterIterator& newText, 428 UErrorCode &status) 429 { 430 if (U_FAILURE(status)) { 431 return; 432 } 433 CharacterIterator *newIter = newText.clone(); 434 if (newIter == nullptr) { 435 status = U_MEMORY_ALLOCATION_ERROR; 436 return; 437 } 438 delete text; 439 text = newIter; 440 reset(); 441 } 442 443 void 444 Normalizer::setText(ConstChar16Ptr newText, 445 int32_t length, 446 UErrorCode &status) 447 { 448 if (U_FAILURE(status)) { 449 return; 450 } 451 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); 452 if (newIter == nullptr) { 453 status = U_MEMORY_ALLOCATION_ERROR; 454 return; 455 } 456 delete text; 457 text = newIter; 458 reset(); 459 } 460 461 /** 462 * Copies the text under iteration into the UnicodeString referred to by "result". 463 * @param result Receives a copy of the text under iteration. 464 */ 465 void 466 Normalizer::getText(UnicodeString& result) 467 { 468 text->getText(result); 469 } 470 471 //------------------------------------------------------------------------- 472 // Private utility methods 473 //------------------------------------------------------------------------- 474 475 void Normalizer::clearBuffer() { 476 buffer.remove(); 477 bufferPos=0; 478 } 479 480 UBool 481 Normalizer::nextNormalize() { 482 clearBuffer(); 483 currentIndex=nextIndex; 484 text->setIndex(nextIndex); 485 if(!text->hasNext()) { 486 return false; 487 } 488 // Skip at least one character so we make progress. 489 UnicodeString segment(text->next32PostInc()); 490 while(text->hasNext()) { 491 UChar32 c; 492 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { 493 text->move32(-1, CharacterIterator::kCurrent); 494 break; 495 } 496 segment.append(c); 497 } 498 nextIndex=text->getIndex(); 499 UErrorCode errorCode=U_ZERO_ERROR; 500 fNorm2->normalize(segment, buffer, errorCode); 501 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 502 } 503 504 UBool 505 Normalizer::previousNormalize() { 506 clearBuffer(); 507 nextIndex=currentIndex; 508 text->setIndex(currentIndex); 509 if(!text->hasPrevious()) { 510 return false; 511 } 512 UnicodeString segment; 513 while(text->hasPrevious()) { 514 UChar32 c=text->previous32(); 515 segment.insert(0, c); 516 if(fNorm2->hasBoundaryBefore(c)) { 517 break; 518 } 519 } 520 currentIndex=text->getIndex(); 521 UErrorCode errorCode=U_ZERO_ERROR; 522 fNorm2->normalize(segment, buffer, errorCode); 523 bufferPos=buffer.length(); 524 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 525 } 526 527 U_NAMESPACE_END 528 529 #endif /* #if !UCONFIG_NO_NORMALIZATION */