normalizer2impl.cpp (111045B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: normalizer2impl.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov22 16 * created by: Markus W. Scherer 17 */ 18 19 // #define UCPTRIE_DEBUG 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_NORMALIZATION 24 25 #include "unicode/bytestream.h" 26 #include "unicode/edits.h" 27 #include "unicode/normalizer2.h" 28 #include "unicode/stringoptions.h" 29 #include "unicode/ucptrie.h" 30 #include "unicode/udata.h" 31 #include "unicode/umutablecptrie.h" 32 #include "unicode/ustring.h" 33 #include "unicode/utf16.h" 34 #include "unicode/utf8.h" 35 #include "bytesinkutil.h" 36 #include "cmemory.h" 37 #include "mutex.h" 38 #include "normalizer2impl.h" 39 #include "putilimp.h" 40 #include "uassert.h" 41 #include "ucptrie_impl.h" 42 #include "uset_imp.h" 43 #include "uvector.h" 44 45 U_NAMESPACE_BEGIN 46 47 namespace { 48 49 /** 50 * UTF-8 lead byte for minNoMaybeCP. 51 * Can be lower than the actual lead byte for c. 52 * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold. 53 */ 54 inline uint8_t leadByteForCP(UChar32 c) { 55 if (c <= 0x7f) { 56 return static_cast<uint8_t>(c); 57 } else if (c <= 0x7ff) { 58 return static_cast<uint8_t>(0xc0 + (c >> 6)); 59 } else { 60 // Should not occur because ccc(U+0300)!=0. 61 return 0xe0; 62 } 63 } 64 65 /** 66 * Returns the code point from one single well-formed UTF-8 byte sequence 67 * between cpStart and cpLimit. 68 * 69 * Trie UTF-8 macros do not assemble whole code points (for efficiency). 
70 * When we do need the code point, we call this function. 71 * We should not need it for normalization-inert data (norm16==0). 72 * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points. 73 */ 74 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) { 75 // Similar to U8_NEXT_UNSAFE(s, i, c). 76 U_ASSERT(cpStart < cpLimit); 77 uint8_t c = *cpStart; 78 switch(cpLimit-cpStart) { 79 case 1: 80 return c; 81 case 2: 82 return ((c&0x1f)<<6) | (cpStart[1]&0x3f); 83 case 3: 84 // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) 85 return static_cast<char16_t>((c << 12) | ((cpStart[1] & 0x3f) << 6) | (cpStart[2] & 0x3f)); 86 case 4: 87 return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f); 88 default: 89 UPRV_UNREACHABLE_EXIT; // Should not occur. 90 } 91 } 92 93 /** 94 * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF. 95 * Otherwise returns a negative value. 96 */ 97 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) { 98 if ((p - start) >= 3) { 99 p -= 3; 100 uint8_t l = *p; 101 uint8_t t1, t2; 102 if (0xe1 <= l && l <= 0xed && 103 (t1 = static_cast<uint8_t>(p[1] - 0x80)) <= 0x3f && 104 (t2 = static_cast<uint8_t>(p[2] - 0x80)) <= 0x3f && 105 (l < 0xed || t1 <= 0x1f)) { 106 return ((l & 0xf) << 12) | (t1 << 6) | t2; 107 } 108 } 109 return U_SENTINEL; 110 } 111 112 /** 113 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point. 114 * Otherwise returns a negative value. 115 */ 116 int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) { 117 // Jamo T: E1 86 A8..E1 87 82 118 if ((limit - src) >= 3 && *src == 0xe1) { 119 if (src[1] == 0x86) { 120 uint8_t t = src[2]; 121 // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7. 122 // Offset 0 does not correspond to any conjoining Jamo. 
123 if (0xa8 <= t && t <= 0xbf) { 124 return t - 0xa7; 125 } 126 } else if (src[1] == 0x87) { 127 uint8_t t = src[2]; 128 if (static_cast<int8_t>(t) <= static_cast<int8_t>(0x82u)) { 129 return t - (0xa7 - 0x40); 130 } 131 } 132 } 133 return -1; 134 } 135 136 void 137 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta, 138 ByteSink &sink, Edits *edits) { 139 char buffer[U8_MAX_LENGTH]; 140 int32_t length; 141 int32_t cpLength = static_cast<int32_t>(cpLimit - cpStart); 142 if (cpLength == 1) { 143 // The builder makes ASCII map to ASCII. 144 buffer[0] = static_cast<uint8_t>(*cpStart + delta); 145 length = 1; 146 } else { 147 int32_t trail = *(cpLimit-1) + delta; 148 if (0x80 <= trail && trail <= 0xbf) { 149 // The delta only changes the last trail byte. 150 --cpLimit; 151 length = 0; 152 do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit); 153 buffer[length++] = static_cast<uint8_t>(trail); 154 } else { 155 // Decode the code point, add the delta, re-encode. 
156 UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta; 157 length = 0; 158 U8_APPEND_UNSAFE(buffer, length, c); 159 } 160 } 161 if (edits != nullptr) { 162 edits->addReplace(cpLength, length); 163 } 164 sink.Append(buffer, length); 165 } 166 167 } // namespace 168 169 // ReorderingBuffer -------------------------------------------------------- *** 170 171 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, 172 UErrorCode &errorCode) : 173 impl(ni), str(dest), 174 start(str.getBuffer(8)), reorderStart(start), limit(start), 175 remainingCapacity(str.getCapacity()), lastCC(0) { 176 if (start == nullptr && U_SUCCESS(errorCode)) { 177 // getBuffer() already did str.setToBogus() 178 errorCode = U_MEMORY_ALLOCATION_ERROR; 179 } 180 } 181 182 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 183 int32_t length=str.length(); 184 start=str.getBuffer(destCapacity); 185 if(start==nullptr) { 186 // getBuffer() already did str.setToBogus() 187 errorCode=U_MEMORY_ALLOCATION_ERROR; 188 return false; 189 } 190 limit=start+length; 191 remainingCapacity=str.getCapacity()-length; 192 reorderStart=start; 193 if(start==limit) { 194 lastCC=0; 195 } else { 196 setIterator(); 197 lastCC=previousCC(); 198 // Set reorderStart after the last code point with cc<=1 if there is one. 
199 if(lastCC>1) { 200 while(previousCC()>1) {} 201 } 202 reorderStart=codePointLimit; 203 } 204 return true; 205 } 206 207 UBool ReorderingBuffer::equals(const char16_t *otherStart, const char16_t *otherLimit) const { 208 int32_t length = static_cast<int32_t>(limit - start); 209 return 210 length == static_cast<int32_t>(otherLimit - otherStart) && 211 0==u_memcmp(start, otherStart, length); 212 } 213 214 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const { 215 U_ASSERT((otherLimit - otherStart) <= INT32_MAX); // ensured by caller 216 int32_t length = static_cast<int32_t>(limit - start); 217 int32_t otherLength = static_cast<int32_t>(otherLimit - otherStart); 218 // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long. 219 if (otherLength < length || (otherLength / 3) > length) { 220 return false; 221 } 222 // Compare valid strings from between normalization boundaries. 223 // (Invalid sequences are normalization-inert.) 224 for (int32_t i = 0, j = 0;;) { 225 if (i >= length) { 226 return j >= otherLength; 227 } else if (j >= otherLength) { 228 return false; 229 } 230 // Not at the end of either string yet. 
231 UChar32 c, other; 232 U16_NEXT_UNSAFE(start, i, c); 233 U8_NEXT_UNSAFE(otherStart, j, other); 234 if (c != other) { 235 return false; 236 } 237 } 238 } 239 240 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 241 if(remainingCapacity<2 && !resize(2, errorCode)) { 242 return false; 243 } 244 if(lastCC<=cc || cc==0) { 245 limit[0]=U16_LEAD(c); 246 limit[1]=U16_TRAIL(c); 247 limit+=2; 248 lastCC=cc; 249 if(cc<=1) { 250 reorderStart=limit; 251 } 252 } else { 253 insert(c, cc); 254 } 255 remainingCapacity-=2; 256 return true; 257 } 258 259 UBool ReorderingBuffer::append(const char16_t *s, int32_t length, UBool isNFD, 260 uint8_t leadCC, uint8_t trailCC, 261 UErrorCode &errorCode) { 262 if(length==0) { 263 return true; 264 } 265 if(remainingCapacity<length && !resize(length, errorCode)) { 266 return false; 267 } 268 remainingCapacity-=length; 269 if(lastCC<=leadCC || leadCC==0) { 270 if(trailCC<=1) { 271 reorderStart=limit+length; 272 } else if(leadCC<=1) { 273 reorderStart=limit+1; // Ok if not a code point boundary. 
274 } 275 const char16_t *sLimit=s+length; 276 do { *limit++=*s++; } while(s!=sLimit); 277 lastCC=trailCC; 278 } else { 279 int32_t i=0; 280 UChar32 c; 281 U16_NEXT(s, i, length, c); 282 insert(c, leadCC); // insert first code point 283 while(i<length) { 284 U16_NEXT(s, i, length, c); 285 if(i<length) { 286 if (isNFD) { 287 leadCC = Normalizer2Impl::getCCFromYesOrMaybeYes(impl.getRawNorm16(c)); 288 } else { 289 leadCC = impl.getCC(impl.getNorm16(c)); 290 } 291 } else { 292 leadCC=trailCC; 293 } 294 append(c, leadCC, errorCode); 295 } 296 } 297 return true; 298 } 299 300 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 301 int32_t cpLength=U16_LENGTH(c); 302 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 303 return false; 304 } 305 remainingCapacity-=cpLength; 306 if(cpLength==1) { 307 *limit++ = static_cast<char16_t>(c); 308 } else { 309 limit[0]=U16_LEAD(c); 310 limit[1]=U16_TRAIL(c); 311 limit+=2; 312 } 313 lastCC=0; 314 reorderStart=limit; 315 return true; 316 } 317 318 UBool ReorderingBuffer::appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode) { 319 if(s==sLimit) { 320 return true; 321 } 322 int32_t length = static_cast<int32_t>(sLimit - s); 323 if(remainingCapacity<length && !resize(length, errorCode)) { 324 return false; 325 } 326 u_memcpy(limit, s, length); 327 limit+=length; 328 remainingCapacity-=length; 329 lastCC=0; 330 reorderStart=limit; 331 return true; 332 } 333 334 void ReorderingBuffer::remove() { 335 reorderStart=limit=start; 336 remainingCapacity=str.getCapacity(); 337 lastCC=0; 338 } 339 340 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 341 if(suffixLength<(limit-start)) { 342 limit-=suffixLength; 343 remainingCapacity+=suffixLength; 344 } else { 345 limit=start; 346 remainingCapacity=str.getCapacity(); 347 } 348 lastCC=0; 349 reorderStart=limit; 350 } 351 352 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 353 int32_t 
reorderStartIndex = static_cast<int32_t>(reorderStart - start); 354 int32_t length = static_cast<int32_t>(limit - start); 355 str.releaseBuffer(length); 356 int32_t newCapacity=length+appendLength; 357 int32_t doubleCapacity=2*str.getCapacity(); 358 if(newCapacity<doubleCapacity) { 359 newCapacity=doubleCapacity; 360 } 361 if(newCapacity<256) { 362 newCapacity=256; 363 } 364 start=str.getBuffer(newCapacity); 365 if(start==nullptr) { 366 // getBuffer() already did str.setToBogus() 367 errorCode=U_MEMORY_ALLOCATION_ERROR; 368 return false; 369 } 370 reorderStart=start+reorderStartIndex; 371 limit=start+length; 372 remainingCapacity=str.getCapacity()-length; 373 return true; 374 } 375 376 void ReorderingBuffer::skipPrevious() { 377 codePointLimit=codePointStart; 378 char16_t c=*--codePointStart; 379 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 380 --codePointStart; 381 } 382 } 383 384 uint8_t ReorderingBuffer::previousCC() { 385 codePointLimit=codePointStart; 386 if(reorderStart>=codePointStart) { 387 return 0; 388 } 389 UChar32 c=*--codePointStart; 390 char16_t c2; 391 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 392 --codePointStart; 393 c=U16_GET_SUPPLEMENTARY(c2, c); 394 } 395 return impl.getCCFromYesOrMaybeYesCP(c); 396 } 397 398 // Inserts c somewhere before the last character. 399 // Requires 0<cc<lastCC which implies reorderStart<limit. 
400 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 401 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 402 // insert c at codePointLimit, after the character with prevCC<=cc 403 char16_t *q=limit; 404 char16_t *r=limit+=U16_LENGTH(c); 405 do { 406 *--r=*--q; 407 } while(codePointLimit!=q); 408 writeCodePoint(q, c); 409 if(cc<=1) { 410 reorderStart=r; 411 } 412 } 413 414 // Normalizer2Impl --------------------------------------------------------- *** 415 416 struct CanonIterData : public UMemory { 417 CanonIterData(UErrorCode &errorCode); 418 ~CanonIterData(); 419 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 420 UMutableCPTrie *mutableTrie; 421 UCPTrie *trie; 422 UVector canonStartSets; // contains UnicodeSet * 423 }; 424 425 Normalizer2Impl::~Normalizer2Impl() { 426 delete fCanonIterData; 427 } 428 429 void 430 Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie, 431 const uint16_t *inExtraData, const uint8_t *inSmallFCD) { 432 minDecompNoCP = static_cast<char16_t>(inIndexes[IX_MIN_DECOMP_NO_CP]); 433 minCompNoMaybeCP = static_cast<char16_t>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]); 434 minLcccCP = static_cast<char16_t>(inIndexes[IX_MIN_LCCC_CP]); 435 436 minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]); 437 minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]); 438 minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]); 439 minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]); 440 minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); 441 minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]); 442 limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]); 443 minMaybeNo = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO]); 444 minMaybeNoCombinesFwd = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO_COMBINES_FWD]); 445 minMaybeYes = 
static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]); 446 U_ASSERT((minMaybeNo & 7) == 0); // 8-aligned for noNoDelta bit fields 447 centerNoNoDelta = (minMaybeNo >> DELTA_SHIFT) - MAX_DELTA - 1; 448 449 normTrie=inTrie; 450 extraData=inExtraData; 451 smallFCD=inSmallFCD; 452 } 453 454 U_CDECL_BEGIN 455 456 static uint32_t U_CALLCONV 457 segmentStarterMapper(const void * /*context*/, uint32_t value) { 458 return value&CANON_NOT_SEGMENT_STARTER; 459 } 460 461 U_CDECL_END 462 463 void 464 Normalizer2Impl::addLcccChars(UnicodeSet &set) const { 465 UChar32 start = 0, end; 466 uint32_t norm16; 467 while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, 468 nullptr, nullptr, &norm16)) >= 0) { 469 if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES && 470 norm16 != Normalizer2Impl::JAMO_VT) { 471 set.add(start, end); 472 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { 473 uint16_t fcd16 = getFCD16(start); 474 if (fcd16 > 0xff) { set.add(start, end); } 475 } 476 start = end + 1; 477 } 478 } 479 480 void 481 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { 482 // Add the start code point of each same-value range of the trie. 483 UChar32 start = 0, end; 484 uint32_t value; 485 while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, 486 nullptr, nullptr, &value)) >= 0) { 487 sa->add(sa->set, start); 488 if (start != end && isAlgorithmicNoNo(static_cast<uint16_t>(value)) && 489 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) { 490 // Range of code points with same-norm16-value algorithmic decompositions. 491 // They might have different non-zero FCD16 values. 
492 uint16_t prevFCD16 = getFCD16(start); 493 while (++start <= end) { 494 uint16_t fcd16 = getFCD16(start); 495 if (fcd16 != prevFCD16) { 496 sa->add(sa->set, start); 497 prevFCD16 = fcd16; 498 } 499 } 500 } 501 start = end + 1; 502 } 503 504 /* add Hangul LV syllables and LV+1 because of skippables */ 505 for(char16_t c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 506 sa->add(sa->set, c); 507 sa->add(sa->set, c+1); 508 } 509 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 510 } 511 512 void 513 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 514 // Add the start code point of each same-value range of the canonical iterator data trie. 515 if (!ensureCanonIterData(errorCode)) { return; } 516 // Currently only used for the SEGMENT_STARTER property. 517 UChar32 start = 0, end; 518 uint32_t value; 519 while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0, 520 segmentStarterMapper, nullptr, &value)) >= 0) { 521 sa->add(sa->set, start); 522 start = end + 1; 523 } 524 } 525 526 const char16_t * 527 Normalizer2Impl::copyLowPrefixFromNulTerminated(const char16_t *src, 528 UChar32 minNeedDataCP, 529 ReorderingBuffer *buffer, 530 UErrorCode &errorCode) const { 531 // Make some effort to support NUL-terminated strings reasonably. 532 // Take the part of the fast quick check loop that does not look up 533 // data and check the first part of the string. 534 // After this prefix, determine the string length to simplify the rest 535 // of the code. 536 const char16_t *prevSrc=src; 537 char16_t c; 538 while((c=*src++)<minNeedDataCP && c!=0) {} 539 // Back out the last character for full processing. 540 // Copy this prefix. 
541 if(--src!=prevSrc) { 542 if(buffer!=nullptr) { 543 buffer->appendZeroCC(prevSrc, src, errorCode); 544 } 545 } 546 return src; 547 } 548 549 UnicodeString & 550 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, 551 UErrorCode &errorCode) const { 552 if(U_FAILURE(errorCode)) { 553 dest.setToBogus(); 554 return dest; 555 } 556 const char16_t *sArray=src.getBuffer(); 557 if(&dest==&src || sArray==nullptr) { 558 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 559 dest.setToBogus(); 560 return dest; 561 } 562 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); 563 return dest; 564 } 565 566 void 567 Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit, 568 UnicodeString &dest, 569 int32_t destLengthEstimate, 570 UErrorCode &errorCode) const { 571 if(destLengthEstimate<0 && limit!=nullptr) { 572 destLengthEstimate = static_cast<int32_t>(limit - src); 573 } 574 dest.remove(); 575 ReorderingBuffer buffer(*this, dest); 576 if(buffer.init(destLengthEstimate, errorCode)) { 577 decompose(src, limit, &buffer, errorCode); 578 } 579 } 580 581 // Dual functionality: 582 // buffer!=nullptr: normalize 583 // buffer==nullptr: isNormalized/spanQuickCheckYes 584 const char16_t * 585 Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit, 586 ReorderingBuffer *buffer, 587 UErrorCode &errorCode) const { 588 UChar32 minNoCP=minDecompNoCP; 589 if(limit==nullptr) { 590 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 591 if(U_FAILURE(errorCode)) { 592 return src; 593 } 594 limit=u_strchr(src, 0); 595 } 596 597 const char16_t *prevSrc; 598 UChar32 c=0; 599 uint16_t norm16=0; 600 601 // only for quick check 602 const char16_t *prevBoundary=src; 603 uint8_t prevCC=0; 604 605 for(;;) { 606 // count code units below the minimum or with irrelevant data for the quick check 607 for(prevSrc=src; src!=limit;) { 608 if( (c=*src)<minNoCP || 609 isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, 
c)) 610 ) { 611 ++src; 612 } else if(!U16_IS_LEAD(c)) { 613 break; 614 } else { 615 char16_t c2; 616 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 617 c=U16_GET_SUPPLEMENTARY(c, c2); 618 norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c); 619 if(isMostDecompYesAndZeroCC(norm16)) { 620 src+=2; 621 } else { 622 break; 623 } 624 } else { 625 ++src; // unpaired lead surrogate: inert 626 } 627 } 628 } 629 // copy these code units all at once 630 if(src!=prevSrc) { 631 if(buffer!=nullptr) { 632 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 633 break; 634 } 635 } else { 636 prevCC=0; 637 prevBoundary=src; 638 } 639 } 640 if(src==limit) { 641 break; 642 } 643 644 // Check one above-minimum, relevant code point. 645 src+=U16_LENGTH(c); 646 if(buffer!=nullptr) { 647 if(!decompose(c, norm16, *buffer, errorCode)) { 648 break; 649 } 650 } else { 651 if(isDecompYes(norm16)) { 652 uint8_t cc=getCCFromYesOrMaybeYes(norm16); 653 if(prevCC<=cc || cc==0) { 654 prevCC=cc; 655 if(cc<=1) { 656 prevBoundary=src; 657 } 658 continue; 659 } 660 } 661 return prevBoundary; // "no" or cc out of order 662 } 663 } 664 return src; 665 } 666 667 // Decompose a short piece of text which is likely to contain characters that 668 // fail the quick check loop and/or where the quick check loop's overhead 669 // is unlikely to be amortized. 670 // Called by the compose() and makeFCD() implementations. 
671 const char16_t * 672 Normalizer2Impl::decomposeShort(const char16_t *src, const char16_t *limit, 673 UBool stopAtCompBoundary, UBool onlyContiguous, 674 ReorderingBuffer &buffer, UErrorCode &errorCode) const { 675 if (U_FAILURE(errorCode)) { 676 return nullptr; 677 } 678 while(src<limit) { 679 if (stopAtCompBoundary && *src < minCompNoMaybeCP) { 680 return src; 681 } 682 const char16_t *prevSrc = src; 683 UChar32 c; 684 uint16_t norm16; 685 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16); 686 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 687 return prevSrc; 688 } 689 if(!decompose(c, norm16, buffer, errorCode)) { 690 return nullptr; 691 } 692 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 693 return src; 694 } 695 } 696 return src; 697 } 698 699 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 700 ReorderingBuffer &buffer, 701 UErrorCode &errorCode) const { 702 // get the decomposition and the lead and trail cc's 703 if (norm16 >= limitNoNo) { 704 if (isMaybeYesOrNonZeroCC(norm16)) { 705 return buffer.append(c, getCCFromYesOrMaybeYes(norm16), errorCode); 706 } else if (norm16 < minMaybeNo) { 707 // Maps to an isCompYesAndZeroCC. 
708 c=mapAlgorithmic(c, norm16); 709 norm16=getRawNorm16(c); 710 } 711 } 712 if (norm16 < minYesNo) { 713 // c does not decompose 714 return buffer.append(c, 0, errorCode); 715 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 716 // Hangul syllable: decompose algorithmically 717 char16_t jamos[3]; 718 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 719 } 720 // c decomposes, get everything from the variable-length extra data 721 const uint16_t *mapping=getData(norm16); 722 uint16_t firstUnit=*mapping; 723 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 724 uint8_t leadCC, trailCC; 725 trailCC = static_cast<uint8_t>(firstUnit >> 8); 726 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 727 leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8); 728 } else { 729 leadCC=0; 730 } 731 return buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode); 732 } 733 734 // Dual functionality: 735 // sink != nullptr: normalize 736 // sink == nullptr: isNormalized/spanQuickCheckYes 737 const uint8_t * 738 Normalizer2Impl::decomposeUTF8(uint32_t options, 739 const uint8_t *src, const uint8_t *limit, 740 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const { 741 U_ASSERT(limit != nullptr); 742 UnicodeString s16; 743 uint8_t minNoLead = leadByteForCP(minDecompNoCP); 744 745 const uint8_t *prevBoundary = src; 746 // only for quick check 747 uint8_t prevCC = 0; 748 749 for (;;) { 750 // Fast path: Scan over a sequence of characters below the minimum "no" code point, 751 // or with (decompYes && ccc==0) properties. 
752 const uint8_t *fastStart = src; 753 const uint8_t *prevSrc; 754 uint16_t norm16 = 0; 755 756 for (;;) { 757 if (src == limit) { 758 if (prevBoundary != limit && sink != nullptr) { 759 ByteSinkUtil::appendUnchanged(prevBoundary, limit, 760 *sink, options, edits, errorCode); 761 } 762 return src; 763 } 764 if (*src < minNoLead) { 765 ++src; 766 } else { 767 prevSrc = src; 768 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); 769 if (!isMostDecompYesAndZeroCC(norm16)) { 770 break; 771 } 772 } 773 } 774 // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo, 775 // and the current character at [prevSrc..src[ is not a common case with cc=0 776 // (MIN_NORMAL_MAYBE_YES or JAMO_VT). 777 // It could still be a maybeYes with cc=0. 778 if (prevSrc != fastStart) { 779 // The fast path looped over yes/0 characters before the current one. 780 if (sink != nullptr && 781 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 782 *sink, options, edits, errorCode)) { 783 break; 784 } 785 prevBoundary = prevSrc; 786 prevCC = 0; 787 } 788 789 // Medium-fast path: Quick check. 790 if (isMaybeYesOrNonZeroCC(norm16)) { 791 // Does not decompose. 792 uint8_t cc = getCCFromYesOrMaybeYes(norm16); 793 if (prevCC <= cc || cc == 0) { 794 prevCC = cc; 795 if (cc <= 1) { 796 if (sink != nullptr && 797 !ByteSinkUtil::appendUnchanged(prevBoundary, src, 798 *sink, options, edits, errorCode)) { 799 break; 800 } 801 prevBoundary = src; 802 } 803 continue; 804 } 805 } 806 if (sink == nullptr) { 807 return prevBoundary; // quick check: "no" or cc out of order 808 } 809 810 // Slow path 811 // Decompose up to and including the current character. 
812 if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) { 813 if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 814 *sink, options, edits, errorCode)) { 815 break; 816 } 817 prevBoundary = prevSrc; 818 } 819 ReorderingBuffer buffer(*this, s16, errorCode); 820 if (U_FAILURE(errorCode)) { 821 break; 822 } 823 decomposeShort(prevBoundary, src, STOP_AT_LIMIT, false /* onlyContiguous */, 824 buffer, errorCode); 825 // Decompose until the next boundary. 826 if (buffer.getLastCC() > 1) { 827 src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, false /* onlyContiguous */, 828 buffer, errorCode); 829 } 830 if (U_FAILURE(errorCode)) { 831 break; 832 } 833 if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals() 834 errorCode = U_INDEX_OUTOFBOUNDS_ERROR; 835 break; 836 } 837 // We already know there was a change if the original character decomposed; 838 // otherwise compare. 839 if (isMaybeYesOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) { 840 if (!ByteSinkUtil::appendUnchanged(prevBoundary, src, 841 *sink, options, edits, errorCode)) { 842 break; 843 } 844 } else { 845 if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(), 846 *sink, edits, errorCode)) { 847 break; 848 } 849 } 850 prevBoundary = src; 851 prevCC = 0; 852 } 853 return src; 854 } 855 856 const uint8_t * 857 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, 858 StopAt stopAt, UBool onlyContiguous, 859 ReorderingBuffer &buffer, UErrorCode &errorCode) const { 860 if (U_FAILURE(errorCode)) { 861 return nullptr; 862 } 863 while (src < limit) { 864 const uint8_t *prevSrc = src; 865 uint16_t norm16; 866 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); 867 // Get the decomposition and the lead and trail cc's. 868 UChar32 c = U_SENTINEL; 869 if (norm16 >= limitNoNo) { 870 if (isMaybeYesOrNonZeroCC(norm16)) { 871 // No comp boundaries around this character. 
872 uint8_t cc = getCCFromYesOrMaybeYes(norm16); 873 if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { 874 return prevSrc; 875 } 876 c = codePointFromValidUTF8(prevSrc, src); 877 if (!buffer.append(c, cc, errorCode)) { 878 return nullptr; 879 } 880 if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) { 881 return src; 882 } 883 continue; 884 } else if (norm16 < minMaybeNo) { 885 // Maps to an isCompYesAndZeroCC. 886 if (stopAt != STOP_AT_LIMIT) { 887 return prevSrc; 888 } 889 c = codePointFromValidUTF8(prevSrc, src); 890 c = mapAlgorithmic(c, norm16); 891 norm16 = getRawNorm16(c); 892 } 893 } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) { 894 return prevSrc; 895 } 896 // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8. 897 // We do not see invalid UTF-8 here because 898 // its norm16==INERT is normalization-inert, 899 // so it gets copied unchanged in the fast path, 900 // and we stop the slow path where invalid UTF-8 begins. 901 // c >= 0 is the result of an algorithmic mapping. 902 U_ASSERT(c >= 0 || norm16 != INERT); 903 if (norm16 < minYesNo) { 904 if (c < 0) { 905 c = codePointFromValidUTF8(prevSrc, src); 906 } 907 // does not decompose 908 if (!buffer.append(c, 0, errorCode)) { 909 return nullptr; 910 } 911 } else if (isHangulLV(norm16) || isHangulLVT(norm16)) { 912 // Hangul syllable: decompose algorithmically 913 if (c < 0) { 914 c = codePointFromValidUTF8(prevSrc, src); 915 } 916 char16_t jamos[3]; 917 if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) { 918 return nullptr; 919 } 920 } else { 921 // The character decomposes, get everything from the variable-length extra data. 
922 const uint16_t *mapping = getData(norm16); 923 uint16_t firstUnit = *mapping; 924 int32_t length = firstUnit & MAPPING_LENGTH_MASK; 925 uint8_t trailCC = static_cast<uint8_t>(firstUnit >> 8); 926 uint8_t leadCC; 927 if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) { 928 leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8); 929 } else { 930 leadCC = 0; 931 } 932 if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { 933 return prevSrc; 934 } 935 if (!buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode)) { 936 return nullptr; 937 } 938 } 939 if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) || 940 (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) { 941 return src; 942 } 943 } 944 return src; 945 } 946 947 const char16_t * 948 Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const { 949 uint16_t norm16; 950 if(c<minDecompNoCP || isMaybeYesOrNonZeroCC(norm16=getNorm16(c))) { 951 // c does not decompose 952 return nullptr; 953 } 954 const char16_t *decomp = nullptr; 955 if(isDecompNoAlgorithmic(norm16)) { 956 // Maps to an isCompYesAndZeroCC. 957 c=mapAlgorithmic(c, norm16); 958 decomp=buffer; 959 length=0; 960 U16_APPEND_UNSAFE(buffer, length, c); 961 // The mapping might decompose further. 
962 norm16 = getRawNorm16(c); 963 } 964 if (norm16 < minYesNo) { 965 return decomp; 966 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 967 // Hangul syllable: decompose algorithmically 968 length=Hangul::decompose(c, buffer); 969 return buffer; 970 } 971 // c decomposes, get everything from the variable-length extra data 972 const uint16_t *mapping=getData(norm16); 973 length=*mapping&MAPPING_LENGTH_MASK; 974 return reinterpret_cast<const char16_t*>(mapping) + 1; 975 } 976 977 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 978 // so that a raw mapping fits that consists of one unit ("rm0") 979 // plus all but the first two code units of the normal mapping. 980 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. 981 const char16_t * 982 Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const { 983 uint16_t norm16; 984 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 985 // c does not decompose 986 return nullptr; 987 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 988 // Hangul syllable: decompose algorithmically 989 Hangul::getRawDecomposition(c, buffer); 990 length=2; 991 return buffer; 992 } else if(isDecompNoAlgorithmic(norm16)) { 993 c=mapAlgorithmic(c, norm16); 994 length=0; 995 U16_APPEND_UNSAFE(buffer, length, c); 996 return buffer; 997 } 998 // c decomposes, get everything from the variable-length extra data 999 const uint16_t *mapping=getData(norm16); 1000 uint16_t firstUnit=*mapping; 1001 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 1002 if(firstUnit&MAPPING_HAS_RAW_MAPPING) { 1003 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 
1004 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 1005 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; 1006 uint16_t rm0=*rawMapping; 1007 if(rm0<=MAPPING_LENGTH_MASK) { 1008 length=rm0; 1009 return reinterpret_cast<const char16_t*>(rawMapping) - rm0; 1010 } else { 1011 // Copy the normal mapping and replace its first two code units with rm0. 1012 buffer[0] = static_cast<char16_t>(rm0); 1013 u_memcpy(buffer + 1, reinterpret_cast<const char16_t*>(mapping) + 1 + 2, mLength - 2); 1014 length=mLength-1; 1015 return buffer; 1016 } 1017 } else { 1018 length=mLength; 1019 return reinterpret_cast<const char16_t*>(mapping) + 1; 1020 } 1021 } 1022 1023 void Normalizer2Impl::decomposeAndAppend(const char16_t *src, const char16_t *limit, 1024 UBool doDecompose, 1025 UnicodeString &safeMiddle, 1026 ReorderingBuffer &buffer, 1027 UErrorCode &errorCode) const { 1028 buffer.copyReorderableSuffixTo(safeMiddle); 1029 if(doDecompose) { 1030 decompose(src, limit, &buffer, errorCode); 1031 return; 1032 } 1033 // Just merge the strings at the boundary. 
1034 bool isFirst = true; 1035 uint8_t firstCC = 0, prevCC = 0, cc; 1036 const char16_t *p = src; 1037 while (p != limit) { 1038 const char16_t *codePointStart = p; 1039 UChar32 c; 1040 uint16_t norm16; 1041 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); 1042 if ((cc = getCC(norm16)) == 0) { 1043 p = codePointStart; 1044 break; 1045 } 1046 if (isFirst) { 1047 firstCC = cc; 1048 isFirst = false; 1049 } 1050 prevCC = cc; 1051 } 1052 if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr 1053 limit=u_strchr(p, 0); 1054 } 1055 1056 if (buffer.append(src, static_cast<int32_t>(p - src), false, firstCC, prevCC, errorCode)) { 1057 buffer.appendZeroCC(p, limit, errorCode); 1058 } 1059 } 1060 1061 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const { 1062 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1063 norm16HasDecompBoundaryBefore(getNorm16(c)); 1064 } 1065 1066 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const { 1067 if (norm16 < minNoNoCompNoMaybeCC) { 1068 return true; 1069 } 1070 if (norm16 >= limitNoNo) { 1071 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1072 } 1073 // c decomposes, get everything from the variable-length extra data 1074 const uint16_t *mapping=getDataForYesOrNo(norm16); 1075 uint16_t firstUnit=*mapping; 1076 // true if leadCC==0 (hasFCDBoundaryBefore()) 1077 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 1078 } 1079 1080 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const { 1081 if (c < minDecompNoCP) { 1082 return true; 1083 } 1084 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1085 return true; 1086 } 1087 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1088 } 1089 1090 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const { 1091 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1092 return true; 1093 } 1094 if (norm16 >= limitNoNo) { 1095 if 
(isMaybeYesOrNonZeroCC(norm16)) { 1096 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1097 } else if (norm16 < minMaybeNo) { 1098 // Maps to an isCompYesAndZeroCC. 1099 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1100 } 1101 } 1102 // c decomposes, get everything from the variable-length extra data 1103 const uint16_t *mapping=getData(norm16); 1104 uint16_t firstUnit=*mapping; 1105 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1106 // fcd16<=1 || trailCC==0 1107 if(firstUnit>0x1ff) { 1108 return false; // trailCC>1 1109 } 1110 if(firstUnit<=0xff) { 1111 return true; // trailCC==0 1112 } 1113 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1114 // true if leadCC==0 (hasFCDBoundaryBefore()) 1115 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 1116 } 1117 1118 /* 1119 * Finds the recomposition result for 1120 * a forward-combining "lead" character, 1121 * specified with a pointer to its compositions list, 1122 * and a backward-combining "trail" character. 1123 * 1124 * If the lead and trail characters combine, then this function returns 1125 * the following "compositeAndFwd" value: 1126 * Bits 21..1 composite character 1127 * Bit 0 set if the composite is a forward-combining starter 1128 * otherwise it returns -1. 1129 * 1130 * The compositions list has (trail, compositeAndFwd) pair entries, 1131 * encoded as either pairs or triples of 16-bit units. 1132 * The last entry has the high bit of its first unit set. 1133 * 1134 * The list is sorted by ascending trail characters (there are no duplicates). 1135 * A linear search is used. 1136 * 1137 * See normalizer2impl.h for a more detailed description 1138 * of the compositions list format. 
1139 */ 1140 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 1141 uint16_t key1, firstUnit; 1142 if(trail<COMP_1_TRAIL_LIMIT) { 1143 // trail character is 0..33FF 1144 // result entry may have 2 or 3 units 1145 key1 = static_cast<uint16_t>(trail << 1); 1146 while(key1>(firstUnit=*list)) { 1147 list+=2+(firstUnit&COMP_1_TRIPLE); 1148 } 1149 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1150 if(firstUnit&COMP_1_TRIPLE) { 1151 return (static_cast<int32_t>(list[1]) << 16) | list[2]; 1152 } else { 1153 return list[1]; 1154 } 1155 } 1156 } else { 1157 // trail character is 3400..10FFFF 1158 // result entry has 3 units 1159 key1 = static_cast<uint16_t>(COMP_1_TRAIL_LIMIT + 1160 (((trail>>COMP_1_TRAIL_SHIFT))& 1161 ~COMP_1_TRIPLE)); 1162 uint16_t key2 = static_cast<uint16_t>(trail << COMP_2_TRAIL_SHIFT); 1163 uint16_t secondUnit; 1164 for(;;) { 1165 if(key1>(firstUnit=*list)) { 1166 list+=2+(firstUnit&COMP_1_TRIPLE); 1167 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1168 if(key2>(secondUnit=list[1])) { 1169 if(firstUnit&COMP_1_LAST_TUPLE) { 1170 break; 1171 } else { 1172 list+=3; 1173 } 1174 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1175 return (static_cast<int32_t>(secondUnit & ~COMP_2_TRAIL_MASK) << 16) | list[2]; 1176 } else { 1177 break; 1178 } 1179 } else { 1180 break; 1181 } 1182 } 1183 } 1184 return -1; 1185 } 1186 1187 /** 1188 * @param list some character's compositions list 1189 * @param set recursively receives the composites from these compositions 1190 */ 1191 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 1192 uint16_t firstUnit; 1193 int32_t compositeAndFwd; 1194 do { 1195 firstUnit=*list; 1196 if((firstUnit&COMP_1_TRIPLE)==0) { 1197 compositeAndFwd=list[1]; 1198 list+=2; 1199 } else { 1200 compositeAndFwd = ((static_cast<int32_t>(list[1]) & ~COMP_2_TRAIL_MASK) << 16) | list[2]; 1201 list+=3; 1202 } 1203 UChar32 composite=compositeAndFwd>>1; 1204 if((compositeAndFwd&1)!=0) { 1205 
/*
 * Recomposes the buffer text starting at recomposeStartIndex
 * (which is in NFD - decomposed and canonically ordered),
 * and truncates the buffer contents.
 *
 * Note that recomposition never lengthens the text:
 * Any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit.
 *
 * @param buffer              the text is modified in place between
 *                            buffer.getStart()+recomposeStartIndex and buffer.getLimit();
 *                            at the end buffer.setReorderingLimit() is called with the new limit
 * @param recomposeStartIndex code unit offset from buffer.getStart() where recomposition begins
 * @param onlyContiguous      if true (FCC), a non-combining character with cc!=0
 *                            between the starter and a later mark prevents their composition
 */
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                                UBool onlyContiguous) const {
    char16_t *p=buffer.getStart()+recomposeStartIndex;
    char16_t *limit=buffer.getLimit();
    if(p==limit) {
        return;
    }

    // starter: the last forward-combining starter (valid while compositionsList!=nullptr)
    // pRemove: start of the code units to be removed
    // q, r: write/read pointers for moving text inside the buffer
    char16_t *starter, *pRemove, *q, *r;
    const uint16_t *compositionsList;
    UChar32 c, compositeAndFwd;
    uint16_t norm16;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    // Some of the following variables are not used until we have a forward-combining starter
    // and are only initialized now to avoid compiler warnings.
    compositionsList=nullptr;  // used as indicator for whether we have a forward-combining starter
    starter=nullptr;
    starterIsSupplementary=false;
    prevCC=0;

    for(;;) {
        // Read the next code point c and its norm16; p moves past it.
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
        cc=getCCFromYesOrMaybeYes(norm16);
        if( // this character combines backward and
            isMaybe(norm16) &&
            // we have seen a starter that combines forward and
            compositionsList!=nullptr &&
            // the backward-combining character is not blocked
            (prevCC<cc || prevCC==0)
        ) {
            if(isJamoVT(norm16)) {
                // c is a Jamo V/T, see if we can compose it with the previous character.
                if(c<Hangul::JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    // The unsigned subtraction wraps around for a starter below JAMO_L_BASE,
                    // so the single comparison below is a range check.
                    char16_t prev = static_cast<char16_t>(*starter - Hangul::JAMO_L_BASE);
                    if(prev<Hangul::JAMO_L_COUNT) {
                        pRemove=p-1;
                        char16_t syllable = static_cast<char16_t>(
                            Hangul::HANGUL_BASE +
                            (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                            Hangul::JAMO_T_COUNT);
                        char16_t t;
                        if (p != limit && (t = static_cast<char16_t>(*p - Hangul::JAMO_T_BASE)) < Hangul::JAMO_T_COUNT) {
                            ++p;
                            syllable+=t;  // The next character was a Jamo T.
                        }
                        *starter=syllable;
                        // remove the Jamo V/T
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        limit=q;
                        p=pRemove;
                    }
                }
                /*
                 * No "else" for Jamo T:
                 * Since the input is in NFD, there are no Hangul LV syllables that
                 * a Jamo T could combine with.
                 * All Jamo Ts are combined above when handling Jamo Vs.
                 */
                if(p==limit) {
                    break;
                }
                compositionsList=nullptr;
                continue;
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
                // The starter and the combining mark (c) do combine.
                UChar32 composite=compositeAndFwd>>1;

                // Replace the starter with the composite, remove the combining mark.
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
                if(starterIsSupplementary) {
                    if(U_IS_SUPPLEMENTARY(composite)) {
                        // both are supplementary
                        starter[0]=U16_LEAD(composite);
                        starter[1]=U16_TRAIL(composite);
                    } else {
                        *starter = static_cast<char16_t>(composite);
                        // The composite is shorter than the starter,
                        // move the intermediate characters forward one.
                        starterIsSupplementary=false;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        --pRemove;
                    }
                } else if(U_IS_SUPPLEMENTARY(composite)) {
                    // The composite is longer than the starter,
                    // move the intermediate characters back one.
                    starterIsSupplementary=true;
                    ++starter;  // temporarily increment for the loop boundary
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=U16_TRAIL(composite);
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
                } else {
                    // both are on the BMP
                    *starter = static_cast<char16_t>(composite);
                }

                /* remove the combining mark by moving the following text over it */
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    limit=q;
                    p=pRemove;
                }
                // Keep prevCC because we removed the combining mark.

                if(p==limit) {
                    break;
                }
                // Is the composite a starter that combines forward?
                if(compositeAndFwd&1) {
                    compositionsList=
                        getCompositionsListForComposite(getRawNorm16(composite));
                } else {
                    compositionsList=nullptr;
                }

                // We combined; continue with looking for compositions.
                continue;
            }
        }

        // no combination this time
        prevCC=cc;
        if(p==limit) {
            break;
        }

        // If c did not combine, then check if it is a starter.
        if(cc==0) {
            // Found a new starter.
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=nullptr) {
                // It may combine with something, prepare for it.
                // p is already past c, so step back by the length of c in code units.
                if(U_IS_BMP(c)) {
                    starterIsSupplementary=false;
                    starter=p-1;
                } else {
                    starterIsSupplementary=true;
                    starter=p-2;
                }
            }
        } else if(onlyContiguous) {
            // FCC: no discontiguous compositions; any intervening character blocks.
            compositionsList=nullptr;
        }
    }
    // Tell the buffer about the (possibly shortened) end of the text.
    buffer.setReorderingLimit(limit);
}
1380 compositionsList=nullptr; 1381 } 1382 } 1383 buffer.setReorderingLimit(limit); 1384 } 1385 1386 UChar32 1387 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { 1388 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16 1389 const uint16_t *list; 1390 if(isInert(norm16)) { 1391 return U_SENTINEL; 1392 } else if(norm16<minYesNoMappingsOnly) { 1393 // a combines forward. 1394 if(isJamoL(norm16)) { 1395 if (b < Hangul::JAMO_V_BASE) { 1396 return U_SENTINEL; 1397 } 1398 b-=Hangul::JAMO_V_BASE; 1399 if(b<Hangul::JAMO_V_COUNT) { 1400 return 1401 (Hangul::HANGUL_BASE+ 1402 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)* 1403 Hangul::JAMO_T_COUNT); 1404 } else { 1405 return U_SENTINEL; 1406 } 1407 } else if(isHangulLV(norm16)) { 1408 if (b <= Hangul::JAMO_T_BASE) { 1409 return U_SENTINEL; 1410 } 1411 b-=Hangul::JAMO_T_BASE; 1412 if(b<Hangul::JAMO_T_COUNT) { // not b==0! 1413 return a+b; 1414 } else { 1415 return U_SENTINEL; 1416 } 1417 } else { 1418 // 'a' has a compositions list in extraData 1419 list=getDataForYesOrNo(norm16); 1420 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 1421 list+= // mapping pointer 1422 1+ // +1 to skip the first unit with the mapping length 1423 (*list&MAPPING_LENGTH_MASK); // + mapping length 1424 } 1425 } 1426 } else if(norm16<minMaybeNoCombinesFwd || MIN_NORMAL_MAYBE_YES<=norm16) { 1427 return U_SENTINEL; 1428 } else { 1429 list=getDataForMaybe(norm16); 1430 if(norm16<minMaybeYes) { // composite 'a' has both mapping & compositions list 1431 list+= // mapping pointer 1432 1+ // +1 to skip the first unit with the mapping length 1433 (*list&MAPPING_LENGTH_MASK); // + mapping length 1434 } 1435 } 1436 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 1437 return U_SENTINEL; 1438 } 1439 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1440 return combine(list, b)>>1; 1441 #else 1442 int32_t compositeAndFwd=combine(list, b); 1443 return compositeAndFwd>=0 ? 
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
/**
 * Composes [src, limit[ into the buffer, or tests whether the text is already composed.
 *
 * @param src            start of the text
 * @param limit          end of the text, or nullptr if src is NUL-terminated
 * @param onlyContiguous passed to the boundary tests, decomposeShort() and recompose();
 *                       marked "FCC" in the comments below
 * @param doCompose      true: write the normalized text to the buffer;
 *                       false: only test, using the buffer as scratch space
 * @param buffer         output or scratch buffer
 * @param errorCode      set by the called functions; also set to
 *                       U_INDEX_OUTOFBOUNDS_ERROR by the length guard in the slow path
 * @return false if !doCompose and a difference was found, or if
 *         copyLowPrefixFromNulTerminated() failed; true otherwise.
 *         Note that the loop exits via `break` after other failures and then
 *         returns true, so success has to be judged from errorCode.
 */
UBool
Normalizer2Impl::compose(const char16_t *src, const char16_t *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    // [prevBoundary, src[ is text that has been scanned but not yet written to the buffer.
    const char16_t *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==nullptr) {
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : nullptr,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return false;
        }
        limit=u_strchr(src, 0);
        if (prevBoundary != src) {
            if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
                prevBoundary = src;
            } else {
                // The last prefix character might interact with what follows:
                // take it back and process it in the main loop.
                buffer.removeSuffix(1);
                prevBoundary = --src;
            }
        }
    }

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const char16_t *prevSrc;
        UChar32 c = 0;
        uint16_t norm16 = 0;
        for (;;) {
            if (src == limit) {
                if (prevBoundary != limit && doCompose) {
                    buffer.appendZeroCC(prevBoundary, limit, errorCode);
                }
                return true;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else {
                prevSrc = src++;
                if(!U16_IS_LEAD(c)) {
                    break;
                } else {
                    char16_t c2;
                    if(src!=limit && U16_IS_TRAIL(c2=*src)) {
                        ++src;
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                        norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                        if(!isCompYesAndZeroCC(norm16)) {
                            break;
                        }
                    }
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" / "maybeNo" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
        if (norm16 < minMaybeNo) {  // minNoNo <= norm16 < minMaybeNo
            if (!doCompose) {
                return false;
            }
            // Fast path for mapping a character that is immediately surrounded by boundaries.
            // In this case, we need not decompose around the current character.
            if (isDecompNoAlgorithmic(norm16)) {
                // Maps to a single isCompYesAndZeroCC character
                // which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 < minNoNoCompBoundaryBefore) {
                // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    // The first unit has the mapping length, the mapping follows.
                    const char16_t *mapping = reinterpret_cast<const char16_t *>(getDataForYesOrNo(norm16));
                    int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
                    if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 >= minNoNoEmpty) {
                // The current character maps to nothing.
                // Simply omit it from the output if there is a boundary before _or_ after it.
                // The character itself implies no boundaries.
                if (hasCompBoundaryBefore(src, limit) ||
                        hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
                    if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            }
            // Other "noNo" type, or need to examine more text around this character:
            // Fall through to the slow path.
        } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
            char16_t prev=*(prevSrc-1);
            if(c<Hangul::JAMO_T_BASE) {
                // The current character is a Jamo Vowel,
                // compose with previous Jamo L and following Jamo T.
                char16_t l = static_cast<char16_t>(prev - Hangul::JAMO_L_BASE);
                if(l<Hangul::JAMO_L_COUNT) {
                    if (!doCompose) {
                        return false;
                    }
                    // t>0: Jamo T offset; t==0: no Jamo T; t<0: undecided, use the slow path.
                    int32_t t;
                    if (src != limit &&
                            0 < (t = (static_cast<int32_t>(*src) - Hangul::JAMO_T_BASE)) &&
                            t < Hangul::JAMO_T_COUNT) {
                        // The next character is a Jamo T.
                        ++src;
                    } else if (hasCompBoundaryBefore(src, limit)) {
                        // No Jamo T follows, not even via decomposition.
                        t = 0;
                    } else {
                        t = -1;
                    }
                    if (t >= 0) {
                        UChar32 syllable = Hangul::HANGUL_BASE +
                            (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
                            Hangul::JAMO_T_COUNT + t;
                        --prevSrc;  // Replace the Jamo L as well.
                        if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                            break;
                        }
                        if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {
                            break;
                        }
                        prevBoundary = src;
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T.
                    // We need to either fully compose that combination here
                    // (which would complicate the code and may not work with strange custom data)
                    // or use the slow path.
                }
            } else if (Hangul::isHangulLV(prev)) {
                // The current character is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if (!doCompose) {
                    return false;
                }
                UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
                --prevSrc;  // Replace the Hangul LV as well.
                if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                    break;
                }
                if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) {
                    break;
                }
                prevBoundary = src;
                continue;
            }
            // No matching context, or may need to decompose surrounding text first:
            // Fall through to the slow path.
        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
            // One or more combining marks that do not combine-back:
            // Check for canonical order, copy unchanged if ok and
            // if followed by a character with a boundary-before.
            uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
            if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if (!doCompose) {
                    return false;
                }
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const char16_t *nextSrc;
                uint16_t n16;
                for (;;) {
                    if (src == limit) {
                        if (doCompose) {
                            buffer.appendZeroCC(prevBoundary, limit, errorCode);
                        }
                        return true;
                    }
                    uint8_t prevCC = cc;
                    // Look at the next code point without consuming it yet.
                    nextSrc = src;
                    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
                    if (n16 >= MIN_YES_YES_WITH_CC) {
                        cc = getCCFromNormalYesOrMaybe(n16);
                        if (prevCC > cc) {
                            if (!doCompose) {
                                return false;
                            }
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                // If there is a boundary here, then we continue with no change.
                if (norm16HasCompBoundaryBefore(n16)) {
                    if (isCompYesAndZeroCC(n16)) {
                        src = nextSrc;
                    }
                    continue;
                }
                // Use the slow path. There is no boundary in [prevSrc, src[.
            }
        }

        // Slow path: Find the nearest boundaries around the current character,
        // decompose and recompose.
        if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
            // Include the previous character unless there is a boundary after it.
            const char16_t *p = prevSrc;
            UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
            if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                prevSrc = p;
            }
        }
        if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
            break;
        }
        int32_t recomposeStartIndex=buffer.length();
        // We know there is not a boundary here.
        decomposeShort(prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, true /* stopAtCompBoundary */, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return true;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized: the recomposed piece must equal the original text.
            if(!buffer.equals(prevSrc, src)) {
                return false;
            }
            buffer.remove();
        }
        prevBoundary=src;
    }
    return true;
}
// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==nullptr: spanQuickCheckYes
// pQCResult!=nullptr: quickCheck (*pQCResult must be UNORM_YES)
/**
 * Scans [src, limit[ for the composition quick check.
 *
 * @param src            start of the text
 * @param limit          end of the text, or nullptr if src is NUL-terminated
 * @param onlyContiguous passed to the boundary tests; marked "FCC" in the comments below
 * @param pQCResult      if not nullptr, set to UNORM_MAYBE or UNORM_NO when such a
 *                       character is found; never written otherwise
 * @return the end of the text if the scan reaches it; otherwise the last boundary
 *         (prevBoundary) before the character that stopped the scan
 */
const char16_t *
Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    const char16_t *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==nullptr) {
        // Nothing is copied (nullptr buffer); the call only skips the low prefix.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, nullptr, errorCode);
        limit=u_strchr(src, 0);
        if (prevBoundary != src) {
            if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
                prevBoundary = src;
            } else {
                prevBoundary = --src;
            }
        }
    }

    for(;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const char16_t *prevSrc;
        UChar32 c = 0;
        uint16_t norm16 = 0;
        for (;;) {
            if(src==limit) {
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else {
                prevSrc = src++;
                if(!U16_IS_LEAD(c)) {
                    break;
                } else {
                    char16_t c2;
                    if(src!=limit && U16_IS_TRAIL(c2=*src)) {
                        ++src;
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                        norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                        if(!isCompYesAndZeroCC(norm16)) {
                            break;
                        }
                    }
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" / "maybeNo" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Move prevBoundary up to the nearest boundary at or just before prevSrc.
        // prevNorm16 stays INERT unless the previous character has no boundary after it.
        uint16_t prevNorm16 = INERT;
        if (prevBoundary != prevSrc) {
            if (norm16HasCompBoundaryBefore(norm16)) {
                prevBoundary = prevSrc;
            } else {
                const char16_t *p = prevSrc;
                uint16_t n16;
                UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
                if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
                    prevBoundary = prevSrc;
                } else {
                    prevBoundary = p;
                    prevNorm16 = n16;
                }
            }
        }

        if (norm16 >= minMaybeNo) {
            uint16_t fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16);
            uint8_t cc = fcd16 >> 8;
            if (onlyContiguous /* FCC */ && cc != 0 &&
                    getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
                // The [prevBoundary..prevSrc[ character
                // passed the quick check "yes && ccc==0" test
                // but is out of canonical order with the current combining mark.
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const char16_t *nextSrc;
                for (;;) {
                    if (norm16 < MIN_YES_YES_WITH_CC) {
                        if (pQCResult != nullptr) {
                            *pQCResult = UNORM_MAYBE;
                        } else {
                            return prevBoundary;
                        }
                    }
                    if (src == limit) {
                        return src;
                    }
                    // The conversion to uint8_t keeps the low byte of fcd16.
                    uint8_t prevCC = fcd16;
                    // Look at the next code point without consuming it yet.
                    nextSrc = src;
                    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
                    if (norm16 >= minMaybeNo) {
                        fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16);
                        cc = fcd16 >> 8;
                        if (!(prevCC <= cc || cc == 0)) {
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                if (isCompYesAndZeroCC(norm16)) {
                    prevBoundary = src;
                    src = nextSrc;
                    continue;
                }
            }
        }
        if(pQCResult!=nullptr) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}
1832 if (isCompYesAndZeroCC(norm16)) { 1833 prevBoundary = src; 1834 src = nextSrc; 1835 continue; 1836 } 1837 } 1838 } 1839 if(pQCResult!=nullptr) { 1840 *pQCResult=UNORM_NO; 1841 } 1842 return prevBoundary; 1843 } 1844 } 1845 1846 void Normalizer2Impl::composeAndAppend(const char16_t *src, const char16_t *limit, 1847 UBool doCompose, 1848 UBool onlyContiguous, 1849 UnicodeString &safeMiddle, 1850 ReorderingBuffer &buffer, 1851 UErrorCode &errorCode) const { 1852 if(!buffer.isEmpty()) { 1853 const char16_t *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous); 1854 if(src!=firstStarterInSrc) { 1855 const char16_t *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 1856 buffer.getLimit(), onlyContiguous); 1857 int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastStarterInDest); 1858 UnicodeString middle(lastStarterInDest, destSuffixLength); 1859 buffer.removeSuffix(destSuffixLength); 1860 safeMiddle=middle; 1861 middle.append(src, static_cast<int32_t>(firstStarterInSrc - src)); 1862 const char16_t *middleStart=middle.getBuffer(); 1863 compose(middleStart, middleStart+middle.length(), onlyContiguous, 1864 true, buffer, errorCode); 1865 if(U_FAILURE(errorCode)) { 1866 return; 1867 } 1868 src=firstStarterInSrc; 1869 } 1870 } 1871 if(doCompose) { 1872 compose(src, limit, onlyContiguous, true, buffer, errorCode); 1873 } else { 1874 if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr 1875 limit=u_strchr(src, 0); 1876 } 1877 buffer.appendZeroCC(src, limit, errorCode); 1878 } 1879 } 1880 1881 UBool 1882 Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, 1883 const uint8_t *src, const uint8_t *limit, 1884 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const { 1885 U_ASSERT(limit != nullptr); 1886 UnicodeString s16; 1887 uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP); 1888 const uint8_t *prevBoundary = src; 1889 1890 for (;;) { 1891 // Fast path: Scan over a sequence of characters below the 
minimum "no or maybe" code point, 1892 // or with (compYes && ccc==0) properties. 1893 const uint8_t *prevSrc; 1894 uint16_t norm16 = 0; 1895 for (;;) { 1896 if (src == limit) { 1897 if (prevBoundary != limit && sink != nullptr) { 1898 ByteSinkUtil::appendUnchanged(prevBoundary, limit, 1899 *sink, options, edits, errorCode); 1900 } 1901 return true; 1902 } 1903 if (*src < minNoMaybeLead) { 1904 ++src; 1905 } else { 1906 prevSrc = src; 1907 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); 1908 if (!isCompYesAndZeroCC(norm16)) { 1909 break; 1910 } 1911 } 1912 } 1913 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1914 // The current character is either a "noNo" (has a mapping) 1915 // or a "maybeYes" / "maybeNo" (combines backward) 1916 // or a "yesYes" with ccc!=0. 1917 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1918 1919 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 1920 if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo 1921 if (sink == nullptr) { 1922 return false; 1923 } 1924 // Fast path for mapping a character that is immediately surrounded by boundaries. 1925 // In this case, we need not decompose around the current character. 1926 if (isDecompNoAlgorithmic(norm16)) { 1927 // Maps to a single isCompYesAndZeroCC character 1928 // which also implies hasCompBoundaryBefore. 1929 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1930 hasCompBoundaryBefore(src, limit)) { 1931 if (prevBoundary != prevSrc && 1932 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 1933 *sink, options, edits, errorCode)) { 1934 break; 1935 } 1936 appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits); 1937 prevBoundary = src; 1938 continue; 1939 } 1940 } else if (norm16 < minNoNoCompBoundaryBefore) { 1941 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 
1942 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1943 hasCompBoundaryBefore(src, limit)) { 1944 if (prevBoundary != prevSrc && 1945 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 1946 *sink, options, edits, errorCode)) { 1947 break; 1948 } 1949 const uint16_t *mapping = getDataForYesOrNo(norm16); 1950 int32_t length = *mapping++ & MAPPING_LENGTH_MASK; 1951 if (!ByteSinkUtil::appendChange(prevSrc, src, reinterpret_cast<const char16_t*>(mapping), length, 1952 *sink, edits, errorCode)) { 1953 break; 1954 } 1955 prevBoundary = src; 1956 continue; 1957 } 1958 } else if (norm16 >= minNoNoEmpty) { 1959 // The current character maps to nothing. 1960 // Simply omit it from the output if there is a boundary before _or_ after it. 1961 // The character itself implies no boundaries. 1962 if (hasCompBoundaryBefore(src, limit) || 1963 hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) { 1964 if (prevBoundary != prevSrc && 1965 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 1966 *sink, options, edits, errorCode)) { 1967 break; 1968 } 1969 if (edits != nullptr) { 1970 edits->addReplace(static_cast<int32_t>(src - prevSrc), 0); 1971 } 1972 prevBoundary = src; 1973 continue; 1974 } 1975 } 1976 // Other "noNo" type, or need to examine more text around this character: 1977 // Fall through to the slow path. 1978 } else if (isJamoVT(norm16)) { 1979 // Jamo L: E1 84 80..92 1980 // Jamo V: E1 85 A1..B5 1981 // Jamo T: E1 86 A8..E1 87 82 1982 U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1); 1983 UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc); 1984 if (prevSrc[1] == 0x85) { 1985 // The current character is a Jamo Vowel, 1986 // compose with previous Jamo L and following Jamo T. 1987 UChar32 l = prev - Hangul::JAMO_L_BASE; 1988 if (static_cast<uint32_t>(l) < Hangul::JAMO_L_COUNT) { 1989 if (sink == nullptr) { 1990 return false; 1991 } 1992 int32_t t = getJamoTMinusBase(src, limit); 1993 if (t >= 0) { 1994 // The next character is a Jamo T. 
1995 src += 3; 1996 } else if (hasCompBoundaryBefore(src, limit)) { 1997 // No Jamo T follows, not even via decomposition. 1998 t = 0; 1999 } 2000 if (t >= 0) { 2001 UChar32 syllable = Hangul::HANGUL_BASE + 2002 (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) * 2003 Hangul::JAMO_T_COUNT + t; 2004 prevSrc -= 3; // Replace the Jamo L as well. 2005 if (prevBoundary != prevSrc && 2006 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 2007 *sink, options, edits, errorCode)) { 2008 break; 2009 } 2010 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits); 2011 prevBoundary = src; 2012 continue; 2013 } 2014 // If we see L+V+x where x!=T then we drop to the slow path, 2015 // decompose and recompose. 2016 // This is to deal with NFKC finding normal L and V but a 2017 // compatibility variant of a T. 2018 // We need to either fully compose that combination here 2019 // (which would complicate the code and may not work with strange custom data) 2020 // or use the slow path. 2021 } 2022 } else if (Hangul::isHangulLV(prev)) { 2023 // The current character is a Jamo Trailing consonant, 2024 // compose with previous Hangul LV that does not contain a Jamo T. 2025 if (sink == nullptr) { 2026 return false; 2027 } 2028 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src); 2029 prevSrc -= 3; // Replace the Hangul LV as well. 2030 if (prevBoundary != prevSrc && 2031 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc, 2032 *sink, options, edits, errorCode)) { 2033 break; 2034 } 2035 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits); 2036 prevBoundary = src; 2037 continue; 2038 } 2039 // No matching context, or may need to decompose surrounding text first: 2040 // Fall through to the slow path. 2041 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 2042 // One or more combining marks that do not combine-back: 2043 // Check for canonical order, copy unchanged if ok and 2044 // if followed by a character with a boundary-before. 
2045 uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 2046 if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) { 2047 // Fails FCD test, need to decompose and contiguously recompose. 2048 if (sink == nullptr) { 2049 return false; 2050 } 2051 } else { 2052 // If !onlyContiguous (not FCC), then we ignore the tccc of 2053 // the previous character which passed the quick check "yes && ccc==0" test. 2054 const uint8_t *nextSrc; 2055 uint16_t n16; 2056 for (;;) { 2057 if (src == limit) { 2058 if (sink != nullptr) { 2059 ByteSinkUtil::appendUnchanged(prevBoundary, limit, 2060 *sink, options, edits, errorCode); 2061 } 2062 return true; 2063 } 2064 uint8_t prevCC = cc; 2065 nextSrc = src; 2066 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16); 2067 if (n16 >= MIN_YES_YES_WITH_CC) { 2068 cc = getCCFromNormalYesOrMaybe(n16); 2069 if (prevCC > cc) { 2070 if (sink == nullptr) { 2071 return false; 2072 } 2073 break; 2074 } 2075 } else { 2076 break; 2077 } 2078 src = nextSrc; 2079 } 2080 // src is after the last in-order combining mark. 2081 // If there is a boundary here, then we continue with no change. 2082 if (norm16HasCompBoundaryBefore(n16)) { 2083 if (isCompYesAndZeroCC(n16)) { 2084 src = nextSrc; 2085 } 2086 continue; 2087 } 2088 // Use the slow path. There is no boundary in [prevSrc, src[. 2089 } 2090 } 2091 2092 // Slow path: Find the nearest boundaries around the current character, 2093 // decompose and recompose. 2094 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 2095 const uint8_t *p = prevSrc; 2096 UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16); 2097 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2098 prevSrc = p; 2099 } 2100 } 2101 ReorderingBuffer buffer(*this, s16, errorCode); 2102 if (U_FAILURE(errorCode)) { 2103 break; 2104 } 2105 // We know there is not a boundary here. 
decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return true;
        }
        recompose(buffer, 0, onlyContiguous);
        if (!buffer.equals(prevSrc, src)) {
            if (sink == nullptr) {
                return false;
            }
            if (prevBoundary != prevSrc &&
                    !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                   *sink, options, edits, errorCode)) {
                break;
            }
            if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
                                            *sink, edits, errorCode)) {
                break;
            }
            prevBoundary = src;
        }
    }
    return true;
}

/**
 * Tests whether there is a composition boundary before the code point that starts at src
 * (UTF-16 version).
 *
 * Returns true without a trie lookup at the end of the text (src == limit)
 * or when the first code unit is below minCompNoMaybeCP.
 * Otherwise looks up the norm16 value of the next code point and
 * returns norm16HasCompBoundaryBefore(norm16).
 */
UBool Normalizer2Impl::hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const {
    if (src == limit || *src < minCompNoMaybeCP) {
        return true;
    }
    UChar32 c;
    uint16_t norm16;
    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
    return norm16HasCompBoundaryBefore(norm16);
}

/**
 * Tests whether there is a composition boundary before the byte sequence that starts at src
 * (UTF-8 version).
 *
 * Returns true at the end of the text (src == limit).
 * Unlike the UTF-16 overload, there is no shortcut for small lead units here:
 * the norm16 value is always looked up.
 */
UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
    if (src == limit) {
        return true;
    }
    uint16_t norm16;
    UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
    return norm16HasCompBoundaryBefore(norm16);
}

/**
 * Tests whether there is a composition boundary after the code point that ends at p
 * (UTF-16 version).
 *
 * Returns true if there is no preceding text (start == p).
 * Otherwise looks up the norm16 value of the code point before p and
 * returns norm16HasCompBoundaryAfter(norm16, onlyContiguous).
 */
UBool Normalizer2Impl::hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
                                            UBool onlyContiguous) const {
    if (start == p) {
        return true;
    }
    UChar32 c;
    uint16_t norm16;
    UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
    return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
}

/**
 * Tests whether there is a composition boundary after the byte sequence that ends at p
 * (UTF-8 version).
 *
 * Returns true if there is no preceding text (start == p).
 * Otherwise looks up the norm16 value for the sequence before p and
 * returns norm16HasCompBoundaryAfter(norm16, onlyContiguous).
 */
UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
                                            UBool onlyContiguous) const {
    if (start == p) {
        return true;
    }
    uint16_t norm16;
    UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
    return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
}

/**
 * Scans backward from p, one code point at a time, for the nearest composition boundary
 * in [start, p].
 *
 * For each code point, checked in this order:
 * - if it has a boundary after it, returns the position after that code point;
 * - if it has a boundary before it, returns the position before that code point.
 * If no boundary is found, the loop ends with p == start and start is returned.
 */
const char16_t *Normalizer2Impl::findPreviousCompBoundary(const char16_t *start, const char16_t *p,
                                                          UBool onlyContiguous) const {
    while (p != start) {
        // Remember the end of the code point before the macro moves p back over it.
        const char16_t *codePointLimit = p;
        UChar32 c;
        uint16_t norm16;
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return codePointLimit;
        }
        if (hasCompBoundaryBefore(c, norm16)) {
            return p;
        }
    }
    return p;
}

/**
 * Scans forward from p, one code point at a time, for the nearest composition boundary
 * in [p, limit].
 *
 * For each code point, checked in this order:
 * - if it has a boundary before it, returns the position before that code point;
 * - if it has a boundary after it, returns the position after that code point.
 * If no boundary is found, the loop ends with p == limit and limit is returned.
 */
const char16_t *Normalizer2Impl::findNextCompBoundary(const char16_t *p, const char16_t *limit,
                                                      UBool onlyContiguous) const {
    while (p != limit) {
        // Remember the start of the code point before the macro moves p forward over it.
        const char16_t *codePointStart = p;
        UChar32 c;
        uint16_t norm16;
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
        if (hasCompBoundaryBefore(c, norm16)) {
            return codePointStart;
        }
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return p;
        }
    }
    return p;
}

/**
 * Returns the low byte of getFCD16() for the code point that ends at p (UTF-16 version),
 * or 0 if there is no preceding text (start == p).
 *
 * The distance p - start is narrowed to int32_t for use as the U16_PREV index.
 */
uint8_t Normalizer2Impl::getPreviousTrailCC(const char16_t *start, const char16_t *p) const {
    if (start == p) {
        return 0;
    }
    int32_t i = static_cast<int32_t>(p - start);
    UChar32 c;
    U16_PREV(start, 0, i, c);
    return static_cast<uint8_t>(getFCD16(c));
}

/**
 * Returns the low byte of getFCD16() for the code point that ends at p (UTF-8 version),
 * or 0 if there is no preceding text (start == p).
 *
 * The distance p - start is narrowed to int32_t for use as the U8_PREV index.
 */
uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
    if (start == p) {
        return 0;
    }
    int32_t i = static_cast<int32_t>(p - start);
    UChar32 c;
    U8_PREV(start, 0, i, c);
    return static_cast<uint8_t>(getFCD16(c));
}

// Note: normalizer2impl.cpp r30982 (2011-nov-27)
// still had getFCDTrie() which
built and cached an FCD trie. 2234 // That provided faster access to FCD data than getFCD16FromNormData() 2235 // but required synchronization and consumed some 10kB of heap memory 2236 // in any process that uses FCD (e.g., via collation). 2237 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance, 2238 // at least for ASCII & CJK. 2239 2240 // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this 2241 // function on Windows ARM64. As a work-around, we disable optimizations for this function. 2242 // This work-around could/should be removed once the following versions of Visual Studio are no 2243 // longer supported: All versions of VS2017, and versions of VS2019 below 16.4. 2244 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924)) 2245 #pragma optimize( "", off ) 2246 #endif 2247 // Gets the FCD value from the regular normalization data. 2248 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { 2249 uint16_t norm16=getNorm16(c); 2250 if (norm16 >= limitNoNo) { 2251 if(norm16>=MIN_NORMAL_MAYBE_YES) { 2252 // combining mark 2253 norm16=getCCFromNormalYesOrMaybe(norm16); 2254 return norm16|(norm16<<8); 2255 } else if(norm16>=minMaybeYes) { 2256 return 0; 2257 } else if(norm16<minMaybeNo) { // isDecompNoAlgorithmic(norm16) 2258 uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK; 2259 if (deltaTrailCC <= DELTA_TCCC_1) { 2260 return deltaTrailCC >> OFFSET_SHIFT; 2261 } 2262 // Maps to an isCompYesAndZeroCC. 
2263 c=mapAlgorithmic(c, norm16); 2264 norm16=getRawNorm16(c); 2265 } 2266 } 2267 if(norm16<=minYesNo || isHangulLVT(norm16)) { 2268 // no decomposition or Hangul syllable, all zeros 2269 return 0; 2270 } 2271 // c decomposes, get everything from the variable-length extra data 2272 const uint16_t *mapping=getData(norm16); 2273 uint16_t firstUnit=*mapping; 2274 norm16=firstUnit>>8; // tccc 2275 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 2276 norm16|=*(mapping-1)&0xff00; // lccc 2277 } 2278 return norm16; 2279 } 2280 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924)) 2281 #pragma optimize( "", on ) 2282 #endif 2283 2284 uint16_t Normalizer2Impl::getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const { 2285 U_ASSERT(norm16 >= minMaybeNo); 2286 if (norm16 >= MIN_NORMAL_MAYBE_YES) { 2287 // combining mark 2288 norm16 = getCCFromNormalYesOrMaybe(norm16); 2289 return norm16 | (norm16<<8); 2290 } else if (norm16 >= minMaybeYes) { 2291 return 0; 2292 } 2293 // c decomposes, get everything from the variable-length extra data 2294 const uint16_t *mapping = getDataForMaybe(norm16); 2295 uint16_t firstUnit = *mapping; 2296 // maybeNo has lccc = 0 2297 U_ASSERT((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (*(mapping - 1) & 0xff00) == 0); 2298 return firstUnit >> 8; // tccc 2299 } 2300 2301 // Dual functionality: 2302 // buffer!=nullptr: normalize 2303 // buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes 2304 const char16_t * 2305 Normalizer2Impl::makeFCD(const char16_t *src, const char16_t *limit, 2306 ReorderingBuffer *buffer, 2307 UErrorCode &errorCode) const { 2308 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 2309 // Similar to the prevBoundary in the compose() implementation. 
2310 const char16_t *prevBoundary=src; 2311 int32_t prevFCD16=0; 2312 if(limit==nullptr) { 2313 src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode); 2314 if(U_FAILURE(errorCode)) { 2315 return src; 2316 } 2317 if(prevBoundary<src) { 2318 prevBoundary=src; 2319 // We know that the previous character's lccc==0. 2320 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 2321 prevFCD16=getFCD16(*(src-1)); 2322 if(prevFCD16>1) { 2323 --prevBoundary; 2324 } 2325 } 2326 limit=u_strchr(src, 0); 2327 } 2328 2329 // Note: In this function we use buffer->appendZeroCC() because we track 2330 // the lead and trail combining classes here, rather than leaving it to 2331 // the ReorderingBuffer. 2332 // The exception is the call to decomposeShort() which uses the buffer 2333 // in the normal way. 2334 2335 const char16_t *prevSrc; 2336 UChar32 c=0; 2337 uint16_t fcd16=0; 2338 2339 for(;;) { 2340 // count code units with lccc==0 2341 for(prevSrc=src; src!=limit;) { 2342 if((c=*src)<minLcccCP) { 2343 prevFCD16=~c; 2344 ++src; 2345 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 2346 prevFCD16=0; 2347 ++src; 2348 } else { 2349 if(U16_IS_LEAD(c)) { 2350 char16_t c2; 2351 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 2352 c=U16_GET_SUPPLEMENTARY(c, c2); 2353 } 2354 } 2355 if((fcd16=getFCD16FromNormData(c))<=0xff) { 2356 prevFCD16=fcd16; 2357 src+=U16_LENGTH(c); 2358 } else { 2359 break; 2360 } 2361 } 2362 } 2363 // copy these code units all at once 2364 if(src!=prevSrc) { 2365 if(buffer!=nullptr && !buffer->appendZeroCC(prevSrc, src, errorCode)) { 2366 break; 2367 } 2368 if(src==limit) { 2369 break; 2370 } 2371 prevBoundary=src; 2372 // We know that the previous character's lccc==0. 2373 if(prevFCD16<0) { 2374 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 
2375 UChar32 prev=~prevFCD16; 2376 if(prev<minDecompNoCP) { 2377 prevFCD16=0; 2378 } else { 2379 prevFCD16=getFCD16FromNormData(prev); 2380 if(prevFCD16>1) { 2381 --prevBoundary; 2382 } 2383 } 2384 } else { 2385 const char16_t *p=src-1; 2386 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { 2387 --p; 2388 // Need to fetch the previous character's FCD value because 2389 // prevFCD16 was just for the trail surrogate code point. 2390 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); 2391 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 2392 } 2393 if(prevFCD16>1) { 2394 prevBoundary=p; 2395 } 2396 } 2397 // The start of the current character (c). 2398 prevSrc=src; 2399 } else if(src==limit) { 2400 break; 2401 } 2402 2403 src+=U16_LENGTH(c); 2404 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 2405 // Check for proper order, and decompose locally if necessary. 2406 if((prevFCD16&0xff)<=(fcd16>>8)) { 2407 // proper order: prev tccc <= current lccc 2408 if((fcd16&0xff)<=1) { 2409 prevBoundary=src; 2410 } 2411 if(buffer!=nullptr && !buffer->appendZeroCC(c, errorCode)) { 2412 break; 2413 } 2414 prevFCD16=fcd16; 2415 continue; 2416 } else if(buffer==nullptr) { 2417 return prevBoundary; // quick check "no" 2418 } else { 2419 /* 2420 * Back out the part of the source that we copied or appended 2421 * already but is now going to be decomposed. 2422 * prevSrc is set to after what was copied/appended. 2423 */ 2424 buffer->removeSuffix(static_cast<int32_t>(prevSrc - prevBoundary)); 2425 /* 2426 * Find the part of the source that needs to be decomposed, 2427 * up to the next safe boundary. 2428 */ 2429 src=findNextFCDBoundary(src, limit); 2430 /* 2431 * The source text does not fulfill the conditions for FCD. 2432 * Decompose and reorder a limited piece of the text. 
2433 */ 2434 decomposeShort(prevBoundary, src, false, false, *buffer, errorCode); 2435 if (U_FAILURE(errorCode)) { 2436 break; 2437 } 2438 prevBoundary=src; 2439 prevFCD16=0; 2440 } 2441 } 2442 return src; 2443 } 2444 2445 void Normalizer2Impl::makeFCDAndAppend(const char16_t *src, const char16_t *limit, 2446 UBool doMakeFCD, 2447 UnicodeString &safeMiddle, 2448 ReorderingBuffer &buffer, 2449 UErrorCode &errorCode) const { 2450 if(!buffer.isEmpty()) { 2451 const char16_t *firstBoundaryInSrc=findNextFCDBoundary(src, limit); 2452 if(src!=firstBoundaryInSrc) { 2453 const char16_t *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), 2454 buffer.getLimit()); 2455 int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastBoundaryInDest); 2456 UnicodeString middle(lastBoundaryInDest, destSuffixLength); 2457 buffer.removeSuffix(destSuffixLength); 2458 safeMiddle=middle; 2459 middle.append(src, static_cast<int32_t>(firstBoundaryInSrc - src)); 2460 const char16_t *middleStart=middle.getBuffer(); 2461 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); 2462 if(U_FAILURE(errorCode)) { 2463 return; 2464 } 2465 src=firstBoundaryInSrc; 2466 } 2467 } 2468 if(doMakeFCD) { 2469 makeFCD(src, limit, &buffer, errorCode); 2470 } else { 2471 if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr 2472 limit=u_strchr(src, 0); 2473 } 2474 buffer.appendZeroCC(src, limit, errorCode); 2475 } 2476 } 2477 2478 const char16_t *Normalizer2Impl::findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const { 2479 while(start<p) { 2480 const char16_t *codePointLimit = p; 2481 UChar32 c; 2482 uint16_t norm16; 2483 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16); 2484 if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) { 2485 return codePointLimit; 2486 } 2487 if (norm16HasDecompBoundaryBefore(norm16)) { 2488 return p; 2489 } 2490 } 2491 return p; 2492 } 2493 2494 const char16_t 
*Normalizer2Impl::findNextFCDBoundary(const char16_t *p, const char16_t *limit) const { 2495 while(p<limit) { 2496 const char16_t *codePointStart=p; 2497 UChar32 c; 2498 uint16_t norm16; 2499 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); 2500 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) { 2501 return codePointStart; 2502 } 2503 if (norm16HasDecompBoundaryAfter(norm16)) { 2504 return p; 2505 } 2506 } 2507 return p; 2508 } 2509 2510 // CanonicalIterator data -------------------------------------------------- *** 2511 2512 CanonIterData::CanonIterData(UErrorCode &errorCode) : 2513 mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr), 2514 canonStartSets(uprv_deleteUObject, nullptr, errorCode) {} 2515 2516 CanonIterData::~CanonIterData() { 2517 umutablecptrie_close(mutableTrie); 2518 ucptrie_close(trie); 2519 } 2520 2521 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 2522 uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead); 2523 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2524 // origin is the first character whose decomposition starts with 2525 // the character for which we are setting the value. 2526 umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode); 2527 } else { 2528 // origin is not the first character, or it is U+0000. 
2529 UnicodeSet *set; 2530 if((canonValue&CANON_HAS_SET)==0) { 2531 LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode); 2532 set=lpSet.getAlias(); 2533 if(U_FAILURE(errorCode)) { 2534 return; 2535 } 2536 UChar32 firstOrigin = static_cast<UChar32>(canonValue & CANON_VALUE_MASK); 2537 canonValue = (canonValue & ~CANON_VALUE_MASK) | CANON_HAS_SET | static_cast<uint32_t>(canonStartSets.size()); 2538 umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode); 2539 canonStartSets.adoptElement(lpSet.orphan(), errorCode); 2540 if (U_FAILURE(errorCode)) { 2541 return; 2542 } 2543 if(firstOrigin!=0) { 2544 set->add(firstOrigin); 2545 } 2546 } else { 2547 set = static_cast<UnicodeSet*>(canonStartSets[static_cast<int32_t>(canonValue & CANON_VALUE_MASK)]); 2548 } 2549 set->add(origin); 2550 } 2551 } 2552 2553 // C++ class for friend access to private Normalizer2Impl members. 2554 class InitCanonIterData { 2555 public: 2556 static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode); 2557 }; 2558 2559 U_CDECL_BEGIN 2560 2561 // UInitOnce instantiation function for CanonIterData 2562 static void U_CALLCONV 2563 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 2564 InitCanonIterData::doInit(impl, errorCode); 2565 } 2566 2567 U_CDECL_END 2568 2569 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) { 2570 U_ASSERT(impl->fCanonIterData == nullptr); 2571 impl->fCanonIterData = new CanonIterData(errorCode); 2572 if (impl->fCanonIterData == nullptr) { 2573 errorCode=U_MEMORY_ALLOCATION_ERROR; 2574 } 2575 if (U_SUCCESS(errorCode)) { 2576 UChar32 start = 0, end; 2577 uint32_t value; 2578 while ((end = ucptrie_getRange(impl->normTrie, start, 2579 UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT, 2580 nullptr, nullptr, &value)) >= 0) { 2581 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 
2582 if (value != Normalizer2Impl::INERT) { 2583 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode); 2584 } 2585 start = end + 1; 2586 } 2587 #ifdef UCPTRIE_DEBUG 2588 umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData"); 2589 #endif 2590 impl->fCanonIterData->trie = umutablecptrie_buildImmutable( 2591 impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode); 2592 umutablecptrie_close(impl->fCanonIterData->mutableTrie); 2593 impl->fCanonIterData->mutableTrie = nullptr; 2594 } 2595 if (U_FAILURE(errorCode)) { 2596 delete impl->fCanonIterData; 2597 impl->fCanonIterData = nullptr; 2598 } 2599 } 2600 2601 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16, 2602 CanonIterData &newData, 2603 UErrorCode &errorCode) const { 2604 if(isInert(norm16) || 2605 (minYesNo<=norm16 && norm16<minNoNo) || 2606 (minMaybeNo<=norm16 && norm16<minMaybeYes)) { 2607 // Inert, or 2-way mapping (including Hangul syllable). 2608 // We do not write a canonStartSet for any yesNo/maybeNo character. 2609 // Composites from 2-way mappings are added at runtime from the 2610 // starter's compositions list, and the other characters in 2611 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 2612 // "maybe" characters. 2613 return; 2614 } 2615 for(UChar32 c=start; c<=end; ++c) { 2616 uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c); 2617 uint32_t newValue=oldValue; 2618 if(isMaybeYesOrNonZeroCC(norm16)) { 2619 // not a segment starter if it occurs in a decomposition or has cc!=0 2620 newValue|=CANON_NOT_SEGMENT_STARTER; 2621 if(norm16<MIN_NORMAL_MAYBE_YES) { 2622 newValue|=CANON_HAS_COMPOSITIONS; 2623 } 2624 } else if(norm16<minYesNo) { 2625 newValue|=CANON_HAS_COMPOSITIONS; 2626 } else { 2627 // c has a one-way decomposition 2628 UChar32 c2=c; 2629 // Do not modify the whole-range norm16 value. 
2630 uint16_t norm16_2=norm16; 2631 if (isDecompNoAlgorithmic(norm16_2)) { 2632 // Maps to an isCompYesAndZeroCC. 2633 c2 = mapAlgorithmic(c2, norm16_2); 2634 norm16_2 = getRawNorm16(c2); 2635 // No compatibility mappings for the CanonicalIterator. 2636 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))); 2637 } 2638 if (norm16_2 > minYesNo) { 2639 // c decomposes, get everything from the variable-length extra data 2640 const uint16_t *mapping=getDataForYesOrNo(norm16_2); 2641 uint16_t firstUnit=*mapping; 2642 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 2643 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 2644 if(c==c2 && (*(mapping-1)&0xff)!=0) { 2645 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 2646 } 2647 } 2648 // Skip empty mappings (no characters in the decomposition). 2649 if(length!=0) { 2650 ++mapping; // skip over the firstUnit 2651 // add c to first code point's start set 2652 int32_t i=0; 2653 U16_NEXT_UNSAFE(mapping, i, c2); 2654 newData.addToStartSet(c, c2, errorCode); 2655 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 2656 // one-way mapping. A 2-way mapping is possible here after 2657 // intermediate algorithmic mapping. 2658 if(norm16_2>=minNoNo) { 2659 while(i<length) { 2660 U16_NEXT_UNSAFE(mapping, i, c2); 2661 uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2); 2662 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 2663 umutablecptrie_set(newData.mutableTrie, c2, 2664 c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode); 2665 } 2666 } 2667 } 2668 } 2669 } else { 2670 // c decomposed to c2 algorithmically; c has cc==0 2671 newData.addToStartSet(c, c2, errorCode); 2672 } 2673 } 2674 if(newValue!=oldValue) { 2675 umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode); 2676 } 2677 } 2678 } 2679 2680 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 2681 // Logically const: Synchronized instantiation. 
2682 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 2683 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 2684 return U_SUCCESS(errorCode); 2685 } 2686 2687 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 2688 return static_cast<int32_t>(ucptrie_get(fCanonIterData->trie, c)); 2689 } 2690 2691 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 2692 return *static_cast<const UnicodeSet*>(fCanonIterData->canonStartSets[n]); 2693 } 2694 2695 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 2696 return getCanonValue(c)>=0; 2697 } 2698 2699 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 2700 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 2701 if(canonValue==0) { 2702 return false; 2703 } 2704 set.clear(); 2705 int32_t value=canonValue&CANON_VALUE_MASK; 2706 if((canonValue&CANON_HAS_SET)!=0) { 2707 set.addAll(getCanonStartSet(value)); 2708 } else if(value!=0) { 2709 set.add(value); 2710 } 2711 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 2712 uint16_t norm16=getRawNorm16(c); 2713 if(norm16==JAMO_L) { 2714 UChar32 syllable= 2715 static_cast<UChar32>(Hangul::HANGUL_BASE + (c - Hangul::JAMO_L_BASE) * Hangul::JAMO_VT_COUNT); 2716 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 2717 } else { 2718 addComposites(getCompositionsList(norm16), set); 2719 } 2720 } 2721 return true; 2722 } 2723 2724 U_NAMESPACE_END 2725 2726 // Normalizer2 data swapping ----------------------------------------------- *** 2727 2728 U_NAMESPACE_USE 2729 2730 U_CAPI int32_t U_EXPORT2 2731 unorm2_swap(const UDataSwapper *ds, 2732 const void *inData, int32_t length, void *outData, 2733 UErrorCode *pErrorCode) { 2734 const UDataInfo *pInfo; 2735 int32_t headerSize; 2736 2737 const uint8_t *inBytes; 2738 uint8_t *outBytes; 2739 2740 const int32_t *inIndexes; 2741 int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1]; 2742 2743 int32_t i, offset, nextOffset, size; 2744 2745 /* 
udata_swapDataHeader checks the arguments */ 2746 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 2747 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 2748 return 0; 2749 } 2750 2751 /* check data format and format version */ 2752 pInfo=(const UDataInfo *)((const char *)inData+4); 2753 uint8_t formatVersion0=pInfo->formatVersion[0]; 2754 if(!( 2755 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 2756 pInfo->dataFormat[1]==0x72 && 2757 pInfo->dataFormat[2]==0x6d && 2758 pInfo->dataFormat[3]==0x32 && 2759 (1<=formatVersion0 && formatVersion0<=5) 2760 )) { 2761 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 2762 pInfo->dataFormat[0], pInfo->dataFormat[1], 2763 pInfo->dataFormat[2], pInfo->dataFormat[3], 2764 pInfo->formatVersion[0]); 2765 *pErrorCode=U_UNSUPPORTED_ERROR; 2766 return 0; 2767 } 2768 2769 inBytes=(const uint8_t *)inData+headerSize; 2770 outBytes=(outData == nullptr) ? 
nullptr : (uint8_t *)outData+headerSize; 2771 2772 inIndexes=(const int32_t *)inBytes; 2773 int32_t minIndexesLength; 2774 if(formatVersion0==1) { 2775 minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1; 2776 } else if(formatVersion0==2) { 2777 minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1; 2778 } else if(formatVersion0<=4) { 2779 minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1; 2780 } else { 2781 minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD+1; 2782 } 2783 2784 if(length>=0) { 2785 length-=headerSize; 2786 if(length<minIndexesLength*4) { 2787 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 2788 length); 2789 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2790 return 0; 2791 } 2792 } 2793 2794 /* read the first few indexes */ 2795 for(i=0; i<UPRV_LENGTHOF(indexes); ++i) { 2796 indexes[i]=udata_readInt32(ds, inIndexes[i]); 2797 } 2798 2799 /* get the total length of the data */ 2800 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 2801 2802 if(length>=0) { 2803 if(length<size) { 2804 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 2805 length); 2806 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2807 return 0; 2808 } 2809 2810 /* copy the data for inaccessible bytes */ 2811 if(inBytes!=outBytes) { 2812 uprv_memcpy(outBytes, inBytes, size); 2813 } 2814 2815 offset=0; 2816 2817 /* swap the int32_t indexes[] */ 2818 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 2819 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 2820 offset=nextOffset; 2821 2822 /* swap the trie */ 2823 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 2824 utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2825 offset=nextOffset; 2826 2827 /* swap the uint16_t extraData[] */ 2828 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 2829 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, 
outBytes+offset, pErrorCode); 2830 offset=nextOffset; 2831 2832 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 2833 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 2834 offset=nextOffset; 2835 2836 U_ASSERT(offset==size); 2837 } 2838 2839 return headerSize+size; 2840 } 2841 2842 #endif // !UCONFIG_NO_NORMALIZATION