normalizer2.cpp (18830B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_NORMALIZATION 22 23 #include "unicode/edits.h" 24 #include "unicode/normalizer2.h" 25 #include "unicode/stringoptions.h" 26 #include "unicode/unistr.h" 27 #include "unicode/unorm.h" 28 #include "cstring.h" 29 #include "mutex.h" 30 #include "norm2allmodes.h" 31 #include "normalizer2impl.h" 32 #include "uassert.h" 33 #include "ucln_cmn.h" 34 35 using icu::Normalizer2Impl; 36 37 #if NORM2_HARDCODE_NFC_DATA 38 // NFC/NFD data machine-generated by gennorm2 --csource 39 #define INCLUDED_FROM_NORMALIZER2_CPP 40 #include "norm2_nfc_data.h" 41 #endif 42 43 U_NAMESPACE_BEGIN 44 45 // Public API dispatch via Normalizer2 subclasses -------------------------- *** 46 47 Normalizer2::~Normalizer2() {} 48 49 void 50 Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink, 51 Edits *edits, UErrorCode &errorCode) const { 52 if (U_FAILURE(errorCode)) { 53 return; 54 } 55 if (edits != nullptr) { 56 errorCode = U_UNSUPPORTED_ERROR; 57 return; 58 } 59 UnicodeString src16 = UnicodeString::fromUTF8(src); 60 normalize(src16, errorCode).toUTF8(sink); 61 } 62 63 UBool 64 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const { 65 return false; 66 } 67 68 UChar32 69 Normalizer2::composePair(UChar32, UChar32) const { 70 return U_SENTINEL; 71 } 72 73 uint8_t 74 Normalizer2::getCombiningClass(UChar32 /*c*/) const { 75 return 0; 76 } 77 78 UBool 79 Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const { 80 return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode); 81 } 82 83 // Normalizer2 implementation for the old UNORM_NONE. 84 class NoopNormalizer2 : public Normalizer2 { 85 virtual ~NoopNormalizer2(); 86 87 virtual UnicodeString & 88 normalize(const UnicodeString &src, 89 UnicodeString &dest, 90 UErrorCode &errorCode) const override { 91 if(U_SUCCESS(errorCode)) { 92 if(&dest!=&src) { 93 dest=src; 94 } else { 95 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 96 } 97 } 98 return dest; 99 } 100 virtual void 101 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 102 Edits *edits, UErrorCode &errorCode) const override { 103 if(U_SUCCESS(errorCode)) { 104 if (edits != nullptr) { 105 if ((options & U_EDITS_NO_RESET) == 0) { 106 edits->reset(); 107 } 108 edits->addUnchanged(src.length()); 109 } 110 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { 111 sink.Append(src.data(), src.length()); 112 } 113 sink.Flush(); 114 } 115 } 116 117 virtual UnicodeString & 118 normalizeSecondAndAppend(UnicodeString &first, 119 const UnicodeString &second, 120 UErrorCode &errorCode) const override { 121 if(U_SUCCESS(errorCode)) { 122 if(&first!=&second) { 123 first.append(second); 124 } else { 125 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 126 } 127 } 128 return first; 129 } 130 virtual UnicodeString & 131 append(UnicodeString &first, 132 const UnicodeString &second, 133 UErrorCode &errorCode) const override { 134 if(U_SUCCESS(errorCode)) { 135 if(&first!=&second) { 136 first.append(second); 137 } else { 138 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 139 } 140 } 141 return first; 142 } 143 virtual UBool 144 getDecomposition(UChar32, UnicodeString &) const override { 145 return false; 146 } 147 // No need to override the default getRawDecomposition(). 148 virtual UBool 149 isNormalized(const UnicodeString &, UErrorCode &errorCode) const override { 150 return U_SUCCESS(errorCode); 151 } 152 virtual UBool 153 isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override { 154 return U_SUCCESS(errorCode); 155 } 156 virtual UNormalizationCheckResult 157 quickCheck(const UnicodeString &, UErrorCode &) const override { 158 return UNORM_YES; 159 } 160 virtual int32_t 161 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override { 162 return s.length(); 163 } 164 virtual UBool hasBoundaryBefore(UChar32) const override { return true; } 165 virtual UBool hasBoundaryAfter(UChar32) const override { return true; } 166 virtual UBool isInert(UChar32) const override { return true; } 167 }; 168 169 NoopNormalizer2::~NoopNormalizer2() {} 170 171 Normalizer2WithImpl::~Normalizer2WithImpl() {} 172 173 DecomposeNormalizer2::~DecomposeNormalizer2() {} 174 175 ComposeNormalizer2::~ComposeNormalizer2() {} 176 177 FCDNormalizer2::~FCDNormalizer2() {} 178 179 // instance cache ---------------------------------------------------------- *** 180 181 U_CDECL_BEGIN 182 static UBool U_CALLCONV uprv_normalizer2_cleanup(); 183 U_CDECL_END 184 185 static Normalizer2 *noopSingleton; 186 static icu::UInitOnce noopInitOnce {}; 187 188 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) { 189 if(U_FAILURE(errorCode)) { 190 return; 191 } 192 noopSingleton=new NoopNormalizer2; 193 if(noopSingleton==nullptr) { 194 errorCode=U_MEMORY_ALLOCATION_ERROR; 195 return; 196 } 197 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup); 198 } 199 200 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) { 201 if(U_FAILURE(errorCode)) { return nullptr; } 202 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode); 203 return noopSingleton; 204 } 205 206 const Normalizer2Impl * 207 Normalizer2Factory::getImpl(const Normalizer2 *norm2) { 208 return &((Normalizer2WithImpl *)norm2)->impl; 209 } 210 211 Norm2AllModes::~Norm2AllModes() { 212 delete impl; 213 } 214 215 Norm2AllModes * 216 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) { 217 if(U_FAILURE(errorCode)) { 218 delete impl; 219 return nullptr; 220 } 221 Norm2AllModes *allModes=new Norm2AllModes(impl); 222 if(allModes==nullptr) { 223 errorCode=U_MEMORY_ALLOCATION_ERROR; 224 delete impl; 225 return nullptr; 226 } 227 return allModes; 228 } 229 230 #if NORM2_HARDCODE_NFC_DATA 231 Norm2AllModes * 232 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) { 233 if(U_FAILURE(errorCode)) { 234 return nullptr; 235 } 236 Normalizer2Impl *impl=new Normalizer2Impl; 237 if(impl==nullptr) { 238 errorCode=U_MEMORY_ALLOCATION_ERROR; 239 return nullptr; 240 } 241 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie, 242 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD); 243 return createInstance(impl, errorCode); 244 } 245 246 static Norm2AllModes *nfcSingleton; 247 248 static icu::UInitOnce nfcInitOnce {}; 249 250 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) { 251 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode); 252 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup); 253 } 254 255 const Norm2AllModes * 256 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) { 257 if(U_FAILURE(errorCode)) { return nullptr; } 258 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode); 259 return nfcSingleton; 260 } 261 262 const Normalizer2 * 263 Normalizer2::getNFCInstance(UErrorCode &errorCode) { 264 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 265 return allModes!=nullptr ? &allModes->comp : nullptr; 266 } 267 268 const Normalizer2 * 269 Normalizer2::getNFDInstance(UErrorCode &errorCode) { 270 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 271 return allModes!=nullptr ? &allModes->decomp : nullptr; 272 } 273 274 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) { 275 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 276 return allModes!=nullptr ? &allModes->fcd : nullptr; 277 } 278 279 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) { 280 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 281 return allModes!=nullptr ? &allModes->fcc : nullptr; 282 } 283 284 const Normalizer2Impl * 285 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) { 286 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode); 287 return allModes!=nullptr ? allModes->impl : nullptr; 288 } 289 #endif // NORM2_HARDCODE_NFC_DATA 290 291 U_CDECL_BEGIN 292 293 static UBool U_CALLCONV uprv_normalizer2_cleanup() { 294 delete noopSingleton; 295 noopSingleton = nullptr; 296 noopInitOnce.reset(); 297 #if NORM2_HARDCODE_NFC_DATA 298 delete nfcSingleton; 299 nfcSingleton = nullptr; 300 nfcInitOnce.reset(); 301 #endif 302 return true; 303 } 304 305 U_CDECL_END 306 307 U_NAMESPACE_END 308 309 // C API ------------------------------------------------------------------- *** 310 311 U_NAMESPACE_USE 312 313 U_CAPI const UNormalizer2 * U_EXPORT2 314 unorm2_getNFCInstance(UErrorCode *pErrorCode) { 315 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode); 316 } 317 318 U_CAPI const UNormalizer2 * U_EXPORT2 319 unorm2_getNFDInstance(UErrorCode *pErrorCode) { 320 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode); 321 } 322 323 U_CAPI void U_EXPORT2 324 unorm2_close(UNormalizer2 *norm2) { 325 delete (Normalizer2 *)norm2; 326 } 327 328 U_CAPI int32_t U_EXPORT2 329 unorm2_normalize(const UNormalizer2 *norm2, 330 const char16_t *src, int32_t length, 331 char16_t *dest, int32_t capacity, 332 UErrorCode *pErrorCode) { 333 if(U_FAILURE(*pErrorCode)) { 334 return 0; 335 } 336 if( (src==nullptr ? length!=0 : length<-1) || 337 (dest==nullptr ? capacity!=0 : capacity<0) || 338 (src==dest && src!=nullptr) 339 ) { 340 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 341 return 0; 342 } 343 UnicodeString destString(dest, 0, capacity); 344 // length==0: Nothing to do, and n2wi->normalize(nullptr, nullptr, buffer, ...) would crash. 345 if(length!=0) { 346 const Normalizer2 *n2=(const Normalizer2 *)norm2; 347 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2); 348 if(n2wi!=nullptr) { 349 // Avoid duplicate argument checking and support NUL-terminated src. 350 ReorderingBuffer buffer(n2wi->impl, destString); 351 if(buffer.init(length, *pErrorCode)) { 352 n2wi->normalize(src, length>=0 ? src+length : nullptr, buffer, *pErrorCode); 353 } 354 } else { 355 UnicodeString srcString(length<0, src, length); 356 n2->normalize(srcString, destString, *pErrorCode); 357 } 358 } 359 return destString.extract(dest, capacity, *pErrorCode); 360 } 361 362 static int32_t 363 normalizeSecondAndAppend(const UNormalizer2 *norm2, 364 char16_t *first, int32_t firstLength, int32_t firstCapacity, 365 const char16_t *second, int32_t secondLength, 366 UBool doNormalize, 367 UErrorCode *pErrorCode) { 368 if(U_FAILURE(*pErrorCode)) { 369 return 0; 370 } 371 if( (second==nullptr ? secondLength!=0 : secondLength<-1) || 372 (first==nullptr ? (firstCapacity!=0 || firstLength!=0) : 373 (firstCapacity<0 || firstLength<-1)) || 374 (first==second && first!=nullptr) 375 ) { 376 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 377 return 0; 378 } 379 UnicodeString firstString(first, firstLength, firstCapacity); 380 firstLength=firstString.length(); // In case it was -1. 381 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash. 382 if(secondLength!=0) { 383 const Normalizer2* n2 = reinterpret_cast<const Normalizer2*>(norm2); 384 const Normalizer2WithImpl* n2wi = dynamic_cast<const Normalizer2WithImpl*>(n2); 385 if(n2wi!=nullptr) { 386 // Avoid duplicate argument checking and support NUL-terminated src. 387 UnicodeString safeMiddle; 388 { 389 ReorderingBuffer buffer(n2wi->impl, firstString); 390 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1 391 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : nullptr, 392 doNormalize, safeMiddle, buffer, *pErrorCode); 393 } 394 } // The ReorderingBuffer destructor finalizes firstString. 395 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) { 396 // Restore the modified suffix of the first string. 397 // This does not restore first[] array contents between firstLength and firstCapacity. 398 // (That might be uninitialized memory, as far as we know.) 399 if(first!=nullptr) { /* don't dereference nullptr */ 400 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length()); 401 if(firstLength<firstCapacity) { 402 first[firstLength]=0; // NUL-terminate in case it was originally. 403 } 404 } 405 } 406 } else { 407 UnicodeString secondString(secondLength<0, second, secondLength); 408 if(doNormalize) { 409 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode); 410 } else { 411 n2->append(firstString, secondString, *pErrorCode); 412 } 413 } 414 } 415 return firstString.extract(first, firstCapacity, *pErrorCode); 416 } 417 418 U_CAPI int32_t U_EXPORT2 419 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2, 420 char16_t *first, int32_t firstLength, int32_t firstCapacity, 421 const char16_t *second, int32_t secondLength, 422 UErrorCode *pErrorCode) { 423 return normalizeSecondAndAppend(norm2, 424 first, firstLength, firstCapacity, 425 second, secondLength, 426 true, pErrorCode); 427 } 428 429 U_CAPI int32_t U_EXPORT2 430 unorm2_append(const UNormalizer2 *norm2, 431 char16_t *first, int32_t firstLength, int32_t firstCapacity, 432 const char16_t *second, int32_t secondLength, 433 UErrorCode *pErrorCode) { 434 return normalizeSecondAndAppend(norm2, 435 first, firstLength, firstCapacity, 436 second, secondLength, 437 false, pErrorCode); 438 } 439 440 U_CAPI int32_t U_EXPORT2 441 unorm2_getDecomposition(const UNormalizer2 *norm2, 442 UChar32 c, char16_t *decomposition, int32_t capacity, 443 UErrorCode *pErrorCode) { 444 if(U_FAILURE(*pErrorCode)) { 445 return 0; 446 } 447 if(decomposition==nullptr ? capacity!=0 : capacity<0) { 448 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 449 return 0; 450 } 451 UnicodeString destString(decomposition, 0, capacity); 452 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) { 453 return destString.extract(decomposition, capacity, *pErrorCode); 454 } else { 455 return -1; 456 } 457 } 458 459 U_CAPI int32_t U_EXPORT2 460 unorm2_getRawDecomposition(const UNormalizer2 *norm2, 461 UChar32 c, char16_t *decomposition, int32_t capacity, 462 UErrorCode *pErrorCode) { 463 if(U_FAILURE(*pErrorCode)) { 464 return 0; 465 } 466 if(decomposition==nullptr ? capacity!=0 : capacity<0) { 467 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 468 return 0; 469 } 470 UnicodeString destString(decomposition, 0, capacity); 471 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) { 472 return destString.extract(decomposition, capacity, *pErrorCode); 473 } else { 474 return -1; 475 } 476 } 477 478 U_CAPI UChar32 U_EXPORT2 479 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) { 480 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b); 481 } 482 483 U_CAPI uint8_t U_EXPORT2 484 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) { 485 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c); 486 } 487 488 U_CAPI UBool U_EXPORT2 489 unorm2_isNormalized(const UNormalizer2 *norm2, 490 const char16_t *s, int32_t length, 491 UErrorCode *pErrorCode) { 492 if(U_FAILURE(*pErrorCode)) { 493 return 0; 494 } 495 if((s==nullptr && length!=0) || length<-1) { 496 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 497 return 0; 498 } 499 UnicodeString sString(length<0, s, length); 500 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode); 501 } 502 503 U_CAPI UNormalizationCheckResult U_EXPORT2 504 unorm2_quickCheck(const UNormalizer2 *norm2, 505 const char16_t *s, int32_t length, 506 UErrorCode *pErrorCode) { 507 if(U_FAILURE(*pErrorCode)) { 508 return UNORM_NO; 509 } 510 if((s==nullptr && length!=0) || length<-1) { 511 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 512 return UNORM_NO; 513 } 514 UnicodeString sString(length<0, s, length); 515 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode); 516 } 517 518 U_CAPI int32_t U_EXPORT2 519 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2, 520 const char16_t *s, int32_t length, 521 UErrorCode *pErrorCode) { 522 if(U_FAILURE(*pErrorCode)) { 523 return 0; 524 } 525 if((s==nullptr && length!=0) || length<-1) { 526 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 527 return 0; 528 } 529 UnicodeString sString(length<0, s, length); 530 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode); 531 } 532 533 U_CAPI UBool U_EXPORT2 534 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) { 535 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c); 536 } 537 538 U_CAPI UBool U_EXPORT2 539 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) { 540 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c); 541 } 542 543 U_CAPI UBool U_EXPORT2 544 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) { 545 return ((const Normalizer2 *)norm2)->isInert(c); 546 } 547 548 // Some properties APIs ---------------------------------------------------- *** 549 550 U_CAPI uint8_t U_EXPORT2 551 u_getCombiningClass(UChar32 c) { 552 UErrorCode errorCode=U_ZERO_ERROR; 553 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); 554 if(U_SUCCESS(errorCode)) { 555 return nfd->getCombiningClass(c); 556 } else { 557 return 0; 558 } 559 } 560 561 U_CFUNC uint16_t 562 unorm_getFCD16(UChar32 c) { 563 UErrorCode errorCode=U_ZERO_ERROR; 564 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); 565 if(U_SUCCESS(errorCode)) { 566 return impl->getFCD16(c); 567 } else { 568 return 0; 569 } 570 } 571 572 #endif // !UCONFIG_NO_NORMALIZATION