ucasemap.cpp (37937B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2005-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucasemap.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2005may06 16 * created by: Markus W. Scherer 17 * 18 * Case mapping service object and functions using it. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/brkiter.h" 23 #include "unicode/bytestream.h" 24 #include "unicode/casemap.h" 25 #include "unicode/edits.h" 26 #include "unicode/stringoptions.h" 27 #include "unicode/stringpiece.h" 28 #include "unicode/ubrk.h" 29 #include "unicode/uloc.h" 30 #include "unicode/ustring.h" 31 #include "unicode/ucasemap.h" 32 #if !UCONFIG_NO_BREAK_ITERATION 33 #include "unicode/utext.h" 34 #endif 35 #include "unicode/utf.h" 36 #include "unicode/utf8.h" 37 #include "unicode/utf16.h" 38 #include "bytesinkutil.h" 39 #include "cmemory.h" 40 #include "cstring.h" 41 #include "uassert.h" 42 #include "ucase.h" 43 #include "ucasemap_imp.h" 44 45 U_NAMESPACE_USE 46 47 /* UCaseMap service object -------------------------------------------------- */ 48 49 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) : 50 #if !UCONFIG_NO_BREAK_ITERATION 51 iter(nullptr), 52 #endif 53 caseLocale(UCASE_LOC_UNKNOWN), options(opts) { 54 ucasemap_setLocale(this, localeID, pErrorCode); 55 } 56 57 UCaseMap::~UCaseMap() { 58 #if !UCONFIG_NO_BREAK_ITERATION 59 delete iter; 60 #endif 61 } 62 63 U_CAPI UCaseMap * U_EXPORT2 64 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { 65 if(U_FAILURE(*pErrorCode)) { 66 return nullptr; 67 } 68 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode); 69 if(csm==nullptr) { 70 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 71 return nullptr; 72 } else if (U_FAILURE(*pErrorCode)) { 73 delete csm; 74 return nullptr; 75 } 76 return csm; 77 } 78 79 U_CAPI void U_EXPORT2 80 ucasemap_close(UCaseMap *csm) { 81 delete csm; 82 } 83 84 U_CAPI const char * U_EXPORT2 85 ucasemap_getLocale(const UCaseMap *csm) { 86 return csm->locale; 87 } 88 89 U_CAPI uint32_t U_EXPORT2 90 ucasemap_getOptions(const UCaseMap *csm) { 91 return csm->options; 92 } 93 94 U_CAPI void U_EXPORT2 95 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 96 if(U_FAILURE(*pErrorCode)) { 97 return; 98 } 99 if (locale != nullptr && *locale == 0) { 100 csm->locale[0] = 0; 101 csm->caseLocale = UCASE_LOC_ROOT; 102 return; 103 } 104 105 UErrorCode bufferStatus = U_ZERO_ERROR; 106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), &bufferStatus); 107 if(bufferStatus==U_BUFFER_OVERFLOW_ERROR || (U_SUCCESS(bufferStatus) && length==sizeof(csm->locale))) { 108 bufferStatus = U_ZERO_ERROR; 109 /* we only really need the language code for case mappings */ 110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), &bufferStatus); 111 } 112 if(U_FAILURE(bufferStatus)) { 113 *pErrorCode=bufferStatus; 114 } else if(length==sizeof(csm->locale)) { 115 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 116 } 117 if(U_SUCCESS(*pErrorCode)) { 118 csm->caseLocale = ucase_getCaseLocale(csm->locale); 119 } else { 120 csm->locale[0]=0; 121 csm->caseLocale = UCASE_LOC_ROOT; 122 } 123 } 124 125 U_CAPI void U_EXPORT2 126 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { 127 if(U_FAILURE(*pErrorCode)) { 128 return; 129 } 130 csm->options=options; 131 } 132 133 /* UTF-8 string case mappings ----------------------------------------------- */ 134 135 /* TODO(markus): Move to a new, separate utf8case.cpp file. */ 136 137 namespace { 138 139 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ 140 inline UBool 141 appendResult(int32_t cpLength, int32_t result, const char16_t *s, 142 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { 143 U_ASSERT(U_SUCCESS(errorCode)); 144 145 /* decode the result */ 146 if(result<0) { 147 /* (not) original code point */ 148 if(edits!=nullptr) { 149 edits->addUnchanged(cpLength); 150 } 151 if((options & U_OMIT_UNCHANGED_TEXT) == 0) { 152 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink); 153 } 154 } else { 155 if(result<=UCASE_MAX_STRING_LENGTH) { 156 // string: "result" is the UTF-16 length 157 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode); 158 } else { 159 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits); 160 } 161 } 162 return true; 163 } 164 165 // See unicode/utf8.h U8_APPEND_UNSAFE(). 166 inline uint8_t getTwoByteLead(UChar32 c) { return static_cast<uint8_t>((c >> 6) | 0xc0); } 167 inline uint8_t getTwoByteTrail(UChar32 c) { return static_cast<uint8_t>((c & 0x3f) | 0x80); } 168 169 UChar32 U_CALLCONV 170 utf8_caseContextIterator(void *context, int8_t dir) { 171 UCaseContext* csc = static_cast<UCaseContext*>(context); 172 UChar32 c; 173 174 if(dir<0) { 175 /* reset for backward iteration */ 176 csc->index=csc->cpStart; 177 csc->dir=dir; 178 } else if(dir>0) { 179 /* reset for forward iteration */ 180 csc->index=csc->cpLimit; 181 csc->dir=dir; 182 } else { 183 /* continue current iteration direction */ 184 dir=csc->dir; 185 } 186 187 if(dir<0) { 188 if(csc->start<csc->index) { 189 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); 190 return c; 191 } 192 } else { 193 if(csc->index<csc->limit) { 194 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); 195 return c; 196 } 197 } 198 return U_SENTINEL; 199 } 200 201 /** 202 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. 203 * caseLocale < 0: Case-folds [srcStart..srcLimit[. 204 */ 205 void toLower(int32_t caseLocale, uint32_t options, 206 const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, 207 icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { 208 const int8_t *latinToLower; 209 if (caseLocale == UCASE_LOC_ROOT || 210 (caseLocale >= 0 ? 211 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) : 212 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) { 213 latinToLower = LatinCase::TO_LOWER_NORMAL; 214 } else { 215 latinToLower = LatinCase::TO_LOWER_TR_LT; 216 } 217 const UTrie2 *trie = ucase_getTrie(); 218 int32_t prev = srcStart; 219 int32_t srcIndex = srcStart; 220 for (;;) { 221 // fast path for simple cases 222 int32_t cpStart; 223 UChar32 c; 224 for (;;) { 225 if (U_FAILURE(errorCode) || srcIndex >= srcLimit) { 226 c = U_SENTINEL; 227 break; 228 } 229 uint8_t lead = src[srcIndex++]; 230 if (lead <= 0x7f) { 231 int8_t d = latinToLower[lead]; 232 if (d == LatinCase::EXC) { 233 cpStart = srcIndex - 1; 234 c = lead; 235 break; 236 } 237 if (d == 0) { continue; } 238 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev, 239 sink, options, edits, errorCode); 240 char ascii = static_cast<char>(lead + d); 241 sink.Append(&ascii, 1); 242 if (edits != nullptr) { 243 edits->addReplace(1, 1); 244 } 245 prev = srcIndex; 246 continue; 247 } else if (lead < 0xe3) { 248 uint8_t t; 249 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit && 250 (t = src[srcIndex] - 0x80) <= 0x3f) { 251 // U+0080..U+017F 252 ++srcIndex; 253 c = ((lead - 0xc0) << 6) | t; 254 int8_t d = latinToLower[c]; 255 if (d == LatinCase::EXC) { 256 cpStart = srcIndex - 2; 257 break; 258 } 259 if (d == 0) { continue; } 260 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev, 261 sink, options, edits, errorCode); 262 ByteSinkUtil::appendTwoBytes(c + d, sink); 263 if (edits != nullptr) { 264 edits->addReplace(2, 2); 265 } 266 prev = srcIndex; 267 continue; 268 } 269 } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) && 270 (srcIndex + 2) <= srcLimit && 271 U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) { 272 // most of CJK: no case mappings 273 srcIndex += 2; 274 continue; 275 } 276 cpStart = --srcIndex; 277 U8_NEXT(src, srcIndex, srcLimit, c); 278 if (c < 0) { 279 // ill-formed UTF-8 280 continue; 281 } 282 uint16_t props = UTRIE2_GET16(trie, c); 283 if (UCASE_HAS_EXCEPTION(props)) { break; } 284 int32_t delta; 285 if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) { 286 continue; 287 } 288 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, 289 sink, options, edits, errorCode); 290 ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits); 291 prev = srcIndex; 292 } 293 if (c < 0) { 294 break; 295 } 296 // slow path 297 const char16_t *s; 298 if (caseLocale >= 0) { 299 csc->cpStart = cpStart; 300 csc->cpLimit = srcIndex; 301 c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale); 302 } else { 303 c = ucase_toFullFolding(c, &s, options); 304 } 305 if (c >= 0) { 306 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, 307 sink, options, edits, errorCode); 308 appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); 309 prev = srcIndex; 310 } 311 } 312 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev, 313 sink, options, edits, errorCode); 314 } 315 316 void toUpper(int32_t caseLocale, uint32_t options, 317 const uint8_t *src, UCaseContext *csc, int32_t srcLength, 318 icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) { 319 const int8_t *latinToUpper; 320 if (caseLocale == UCASE_LOC_TURKISH) { 321 latinToUpper = LatinCase::TO_UPPER_TR; 322 } else { 323 latinToUpper = LatinCase::TO_UPPER_NORMAL; 324 } 325 const UTrie2 *trie = ucase_getTrie(); 326 int32_t prev = 0; 327 int32_t srcIndex = 0; 328 for (;;) { 329 // fast path for simple cases 330 int32_t cpStart; 331 UChar32 c; 332 for (;;) { 333 if (U_FAILURE(errorCode) || srcIndex >= srcLength) { 334 c = U_SENTINEL; 335 break; 336 } 337 uint8_t lead = src[srcIndex++]; 338 if (lead <= 0x7f) { 339 int8_t d = latinToUpper[lead]; 340 if (d == LatinCase::EXC) { 341 cpStart = srcIndex - 1; 342 c = lead; 343 break; 344 } 345 if (d == 0) { continue; } 346 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev, 347 sink, options, edits, errorCode); 348 char ascii = static_cast<char>(lead + d); 349 sink.Append(&ascii, 1); 350 if (edits != nullptr) { 351 edits->addReplace(1, 1); 352 } 353 prev = srcIndex; 354 continue; 355 } else if (lead < 0xe3) { 356 uint8_t t; 357 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength && 358 (t = src[srcIndex] - 0x80) <= 0x3f) { 359 // U+0080..U+017F 360 ++srcIndex; 361 c = ((lead - 0xc0) << 6) | t; 362 int8_t d = latinToUpper[c]; 363 if (d == LatinCase::EXC) { 364 cpStart = srcIndex - 2; 365 break; 366 } 367 if (d == 0) { continue; } 368 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev, 369 sink, options, edits, errorCode); 370 ByteSinkUtil::appendTwoBytes(c + d, sink); 371 if (edits != nullptr) { 372 edits->addReplace(2, 2); 373 } 374 prev = srcIndex; 375 continue; 376 } 377 } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) && 378 (srcIndex + 2) <= srcLength && 379 U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) { 380 // most of CJK: no case mappings 381 srcIndex += 2; 382 continue; 383 } 384 cpStart = --srcIndex; 385 U8_NEXT(src, srcIndex, srcLength, c); 386 if (c < 0) { 387 // ill-formed UTF-8 388 continue; 389 } 390 uint16_t props = UTRIE2_GET16(trie, c); 391 if (UCASE_HAS_EXCEPTION(props)) { break; } 392 int32_t delta; 393 if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) { 394 continue; 395 } 396 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, 397 sink, options, edits, errorCode); 398 ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits); 399 prev = srcIndex; 400 } 401 if (c < 0) { 402 break; 403 } 404 // slow path 405 csc->cpStart = cpStart; 406 csc->cpLimit = srcIndex; 407 const char16_t *s; 408 c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale); 409 if (c >= 0) { 410 ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev, 411 sink, options, edits, errorCode); 412 appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode); 413 prev = srcIndex; 414 } 415 } 416 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev, 417 sink, options, edits, errorCode); 418 } 419 420 } // namespace 421 422 #if !UCONFIG_NO_BREAK_ITERATION 423 424 namespace { 425 426 constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0]; 427 428 constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1]; 429 430 /** 431 * Input: c is a letter I with or without acute accent. 432 * start is the index in src after c, and is less than segmentLimit. 433 * If a plain i/I is followed by a plain j/J, 434 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, 435 * then we output accordingly. 436 * 437 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ 438 */ 439 int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit, 440 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { 441 U_ASSERT(start < segmentLimit); 442 443 int32_t index = start; 444 bool withAcute = false; 445 446 // If the conditions are met, then the following variables tell us what to output. 447 int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) 448 bool doTitleJ = false; // true if the j needs to be titlecased 449 int32_t unchanged2 = 0; // after the j (0 or 1) 450 451 // next character after the first letter 452 UChar32 c2; 453 c2 = src[index++]; 454 455 // Is the first letter an i/I with accent? 456 if (c == u'I') { 457 if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) { 458 withAcute = true; 459 unchanged1 = 2; // ACUTE is 2 code units in UTF-8 460 if (index == segmentLimit) { return start; } 461 c2 = src[index++]; 462 } 463 } else { // Í 464 withAcute = true; 465 } 466 467 // Is the next character a j/J? 468 if (c2 == u'j') { 469 doTitleJ = true; 470 } else if (c2 == u'J') { 471 ++unchanged1; 472 } else { 473 return start; 474 } 475 476 // A plain i/I must be followed by a plain j/J. 477 // An i/I with acute must be followed by a j/J with acute. 478 if (withAcute) { 479 if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { 480 return start; 481 } 482 if (doTitleJ) { 483 unchanged2 = 2; // ACUTE is 2 code units in UTF-8 484 } else { 485 unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8 486 } 487 } 488 489 // There must not be another combining mark. 490 if (index < segmentLimit) { 491 int32_t cp; 492 int32_t i = index; 493 U8_NEXT(src, i, segmentLimit, cp); 494 uint32_t typeMask = U_GET_GC_MASK(cp); 495 if ((typeMask & U_GC_M_MASK) != 0) { 496 return start; 497 } 498 } 499 500 // Output the rest of the Dutch IJ. 501 ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode); 502 start += unchanged1; 503 if (doTitleJ) { 504 ByteSinkUtil::appendCodePoint(1, u'J', sink, edits); 505 ++start; 506 } 507 ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode); 508 509 U_ASSERT(start + unchanged2 == index); 510 return index; 511 } 512 513 } // namespace 514 515 U_CFUNC void U_CALLCONV 516 ucasemap_internalUTF8ToTitle( 517 int32_t caseLocale, uint32_t options, BreakIterator *iter, 518 const uint8_t *src, int32_t srcLength, 519 ByteSink &sink, icu::Edits *edits, 520 UErrorCode &errorCode) { 521 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { 522 return; 523 } 524 525 /* set up local variables */ 526 UCaseContext csc=UCASECONTEXT_INITIALIZER; 527 csc.p=(void *)src; 528 csc.limit=srcLength; 529 int32_t prev=0; 530 UBool isFirstIndex=true; 531 532 /* titlecasing loop */ 533 while(prev<srcLength) { 534 /* find next index where to titlecase */ 535 int32_t index; 536 if(isFirstIndex) { 537 isFirstIndex=false; 538 index=iter->first(); 539 } else { 540 index=iter->next(); 541 } 542 if(index==UBRK_DONE || index>srcLength) { 543 index=srcLength; 544 } 545 546 /* 547 * Segment [prev..index[ into 3 parts: 548 * a) skipped characters (copy as-is) [prev..titleStart[ 549 * b) first letter (titlecase) [titleStart..titleLimit[ 550 * c) subsequent characters (lowercase) [titleLimit..index[ 551 */ 552 if(prev<index) { 553 /* find and copy skipped characters [prev..titleStart[ */ 554 int32_t titleStart=prev; 555 int32_t titleLimit=prev; 556 UChar32 c; 557 U8_NEXT(src, titleLimit, index, c); 558 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { 559 // Adjust the titlecasing index to the next cased character, 560 // or to the next letter/number/symbol/private use. 561 // Stop with titleStart<titleLimit<=index 562 // if there is a character to be titlecased, 563 // or else stop with titleStart==titleLimit==index. 564 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; 565 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { 566 titleStart=titleLimit; 567 if(titleLimit==index) { 568 break; 569 } 570 U8_NEXT(src, titleLimit, index, c); 571 } 572 if (prev < titleStart) { 573 if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev, 574 sink, options, edits, errorCode)) { 575 return; 576 } 577 } 578 } 579 580 if(titleStart<titleLimit) { 581 /* titlecase c which is from [titleStart..titleLimit[ */ 582 if(c>=0) { 583 csc.cpStart=titleStart; 584 csc.cpLimit=titleLimit; 585 const char16_t *s; 586 c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale); 587 if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) { 588 return; 589 } 590 } else { 591 // Malformed UTF-8. 592 if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart, 593 sink, options, edits, errorCode)) { 594 return; 595 } 596 } 597 598 /* Special case Dutch IJ titlecasing */ 599 if (titleLimit < index && 600 caseLocale == UCASE_LOC_DUTCH) { 601 if (c < 0) { 602 c = ~c; 603 } 604 605 if (c == u'I' || c == u'Í') { 606 titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode); 607 } 608 } 609 610 /* lowercase [titleLimit..index[ */ 611 if(titleLimit<index) { 612 if((options&U_TITLECASE_NO_LOWERCASE)==0) { 613 /* Normal operation: Lowercase the rest of the word. */ 614 toLower(caseLocale, options, 615 src, &csc, titleLimit, index, 616 sink, edits, errorCode); 617 if(U_FAILURE(errorCode)) { 618 return; 619 } 620 } else { 621 /* Optionally just copy the rest of the word unchanged. */ 622 if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit, 623 sink, options, edits, errorCode)) { 624 return; 625 } 626 } 627 } 628 } 629 } 630 631 prev=index; 632 } 633 } 634 635 #endif 636 637 U_NAMESPACE_BEGIN 638 namespace GreekUpper { 639 640 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) { 641 while (i < length) { 642 UChar32 c; 643 U8_NEXT(s, i, length, c); 644 int32_t type = ucase_getTypeOrIgnorable(c); 645 if ((type & UCASE_IGNORABLE) != 0) { 646 // Case-ignorable, continue with the loop. 647 } else if (type != UCASE_NONE) { 648 return true; // Followed by cased letter. 649 } else { 650 return false; // Uncased and not case-ignorable. 651 } 652 } 653 return false; // Not followed by cased letter. 654 } 655 656 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java. 657 void toUpper(uint32_t options, 658 const uint8_t *src, int32_t srcLength, 659 ByteSink &sink, Edits *edits, 660 UErrorCode &errorCode) { 661 uint32_t state = 0; 662 for (int32_t i = 0; i < srcLength;) { 663 int32_t nextIndex = i; 664 UChar32 c; 665 U8_NEXT(src, nextIndex, srcLength, c); 666 uint32_t nextState = 0; 667 int32_t type = ucase_getTypeOrIgnorable(c); 668 if ((type & UCASE_IGNORABLE) != 0) { 669 // c is case-ignorable 670 nextState |= (state & AFTER_CASED); 671 } else if (type != UCASE_NONE) { 672 // c is cased 673 nextState |= AFTER_CASED; 674 } 675 uint32_t data = getLetterData(c); 676 if (data > 0) { 677 uint32_t upper = data & UPPER_MASK; 678 // Add a dialytika to this iota or ypsilon vowel 679 // if we removed a tonos from the previous vowel, 680 // and that previous vowel did not also have (or gain) a dialytika. 681 // Adding one only to the final vowel in a longer sequence 682 // (which does not occur in normal writing) would require lookahead. 683 // Set the same flag as for preserving an existing dialytika. 684 if ((data & HAS_VOWEL) != 0 && 685 (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) != 686 0 && 687 (upper == 0x399 || upper == 0x3A5)) { 688 data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA 689 : HAS_COMBINING_DIALYTIKA; 690 } 691 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. 692 if ((data & HAS_YPOGEGRAMMENI) != 0) { 693 numYpogegrammeni = 1; 694 } 695 const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0; 696 // Skip combining diacritics after this Greek letter. 697 int32_t nextNextIndex = nextIndex; 698 while (nextIndex < srcLength) { 699 UChar32 c2; 700 U8_NEXT(src, nextNextIndex, srcLength, c2); 701 uint32_t diacriticData = getDiacriticData(c2); 702 if (diacriticData != 0) { 703 data |= diacriticData; 704 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { 705 ++numYpogegrammeni; 706 } 707 nextIndex = nextNextIndex; 708 } else { 709 break; // not a Greek diacritic 710 } 711 } 712 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { 713 nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT 714 : AFTER_VOWEL_WITH_COMBINING_ACCENT; 715 } 716 // Map according to Greek rules. 717 UBool addTonos = false; 718 if (upper == 0x397 && 719 (data & HAS_ACCENT) != 0 && 720 numYpogegrammeni == 0 && 721 (state & AFTER_CASED) == 0 && 722 !isFollowedByCasedLetter(src, nextIndex, srcLength)) { 723 // Keep disjunctive "or" with (only) a tonos. 724 // We use the same "word boundary" conditions as for the Final_Sigma test. 725 if (hasPrecomposedAccent) { 726 upper = 0x389; // Preserve the precomposed form. 727 } else { 728 addTonos = true; 729 } 730 } else if ((data & HAS_DIALYTIKA) != 0) { 731 // Preserve a vowel with dialytika in precomposed form if it exists. 732 if (upper == 0x399) { 733 upper = 0x3AA; 734 data &= ~HAS_EITHER_DIALYTIKA; 735 } else if (upper == 0x3A5) { 736 upper = 0x3AB; 737 data &= ~HAS_EITHER_DIALYTIKA; 738 } 739 } 740 741 UBool change; 742 if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) { 743 change = true; // common, simple usage 744 } else { 745 // Find out first whether we are changing the text. 746 U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block 747 change = (i + 2) > nextIndex || 748 src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) || 749 numYpogegrammeni > 0; 750 int32_t i2 = i + 2; 751 if ((data & HAS_EITHER_DIALYTIKA) != 0) { 752 change |= (i2 + 2) > nextIndex || 753 src[i2] != static_cast<uint8_t>(u8"\u0308"[0]) || 754 src[i2 + 1] != static_cast<uint8_t>(u8"\u0308"[1]); 755 i2 += 2; 756 } 757 if (addTonos) { 758 change |= (i2 + 2) > nextIndex || 759 src[i2] != static_cast<uint8_t>(u8"\u0301"[0]) || 760 src[i2 + 1] != static_cast<uint8_t>(u8"\u0301"[1]); 761 i2 += 2; 762 } 763 int32_t oldLength = nextIndex - i; 764 int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399 765 change |= oldLength != newLength; 766 if (change) { 767 if (edits != nullptr) { 768 edits->addReplace(oldLength, newLength); 769 } 770 } else { 771 if (edits != nullptr) { 772 edits->addUnchanged(oldLength); 773 } 774 // Write unchanged text? 775 change = (options & U_OMIT_UNCHANGED_TEXT) == 0; 776 } 777 } 778 779 if (change) { 780 ByteSinkUtil::appendTwoBytes(upper, sink); 781 if ((data & HAS_EITHER_DIALYTIKA) != 0) { 782 sink.AppendU8(u8"\u0308", 2); // restore or add a dialytika 783 } 784 if (addTonos) { 785 sink.AppendU8(u8"\u0301", 2); 786 } 787 while (numYpogegrammeni > 0) { 788 sink.AppendU8(u8"\u0399", 2); 789 --numYpogegrammeni; 790 } 791 } 792 } else if(c>=0) { 793 const char16_t *s; 794 c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK); 795 if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) { 796 return; 797 } 798 } else { 799 // Malformed UTF-8. 800 if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i, 801 sink, options, edits, errorCode)) { 802 return; 803 } 804 } 805 i = nextIndex; 806 state = nextState; 807 } 808 } 809 810 } // namespace GreekUpper 811 U_NAMESPACE_END 812 813 static void U_CALLCONV 814 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED 815 const uint8_t *src, int32_t srcLength, 816 icu::ByteSink &sink, icu::Edits *edits, 817 UErrorCode &errorCode) { 818 UCaseContext csc=UCASECONTEXT_INITIALIZER; 819 csc.p=(void *)src; 820 csc.limit=srcLength; 821 toLower( 822 caseLocale, options, 823 src, &csc, 0, srcLength, 824 sink, edits, errorCode); 825 } 826 827 static void U_CALLCONV 828 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED 829 const uint8_t *src, int32_t srcLength, 830 icu::ByteSink &sink, icu::Edits *edits, 831 UErrorCode &errorCode) { 832 if (caseLocale == UCASE_LOC_GREEK) { 833 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode); 834 } else { 835 UCaseContext csc=UCASECONTEXT_INITIALIZER; 836 csc.p=(void *)src; 837 csc.limit=srcLength; 838 toUpper( 839 caseLocale, options, 840 src, &csc, srcLength, 841 sink, edits, errorCode); 842 } 843 } 844 845 static void U_CALLCONV 846 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED 847 const uint8_t *src, int32_t srcLength, 848 icu::ByteSink &sink, icu::Edits *edits, 849 UErrorCode &errorCode) { 850 toLower( 851 -1, options, 852 src, nullptr, 0, srcLength, 853 sink, edits, errorCode); 854 } 855 856 void 857 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM 858 const char *src, int32_t srcLength, 859 UTF8CaseMapper *stringCaseMapper, 860 icu::ByteSink &sink, icu::Edits *edits, 861 UErrorCode &errorCode) { 862 /* check argument values */ 863 if (U_FAILURE(errorCode)) { 864 return; 865 } 866 if ((src == nullptr && srcLength != 0) || srcLength < -1) { 867 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 868 return; 869 } 870 871 // Get the string length. 872 if (srcLength == -1) { 873 srcLength = static_cast<int32_t>(uprv_strlen(src)); 874 } 875 876 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { 877 edits->reset(); 878 } 879 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR 880 reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, errorCode); 881 sink.Flush(); 882 if (U_SUCCESS(errorCode)) { 883 if (edits != nullptr) { 884 edits->copyErrorTo(errorCode); 885 } 886 } 887 } 888 889 int32_t 890 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM 891 char *dest, int32_t destCapacity, 892 const char *src, int32_t srcLength, 893 UTF8CaseMapper *stringCaseMapper, 894 icu::Edits *edits, 895 UErrorCode &errorCode) { 896 /* check argument values */ 897 if(U_FAILURE(errorCode)) { 898 return 0; 899 } 900 if( destCapacity<0 || 901 (dest==nullptr && destCapacity>0) || 902 (src==nullptr && srcLength!=0) || srcLength<-1 903 ) { 904 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 905 return 0; 906 } 907 908 /* get the string length */ 909 if(srcLength==-1) { 910 srcLength = static_cast<int32_t>(uprv_strlen(src)); 911 } 912 913 /* check for overlapping source and destination */ 914 if( dest!=nullptr && 915 ((src>=dest && src<(dest+destCapacity)) || 916 (dest>=src && dest<(src+srcLength))) 917 ) { 918 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 919 return 0; 920 } 921 922 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { 923 edits->reset(); 924 } 925 int32_t reslen = ByteSinkUtil::viaByteSinkToTerminatedChars( 926 dest, destCapacity, 927 [&](ByteSink& sink, UErrorCode& status) { 928 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR 929 reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, status); 930 }, 931 errorCode); 932 if (U_SUCCESS(errorCode) && edits != nullptr) { 933 edits->copyErrorTo(errorCode); 934 } 935 return reslen; 936 } 937 938 /* public API functions */ 939 940 U_CAPI int32_t U_EXPORT2 941 ucasemap_utf8ToLower(const UCaseMap *csm, 942 char *dest, int32_t destCapacity, 943 const char *src, int32_t srcLength, 944 UErrorCode *pErrorCode) { 945 return ucasemap_mapUTF8( 946 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL 947 dest, destCapacity, 948 src, srcLength, 949 ucasemap_internalUTF8ToLower, nullptr, *pErrorCode); 950 } 951 952 U_CAPI int32_t U_EXPORT2 953 ucasemap_utf8ToUpper(const UCaseMap *csm, 954 char *dest, int32_t destCapacity, 955 const char *src, int32_t srcLength, 956 UErrorCode *pErrorCode) { 957 return ucasemap_mapUTF8( 958 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL 959 dest, destCapacity, 960 src, srcLength, 961 ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode); 962 } 963 964 U_CAPI int32_t U_EXPORT2 965 ucasemap_utf8FoldCase(const UCaseMap *csm, 966 char *dest, int32_t destCapacity, 967 const char *src, int32_t srcLength, 968 UErrorCode *pErrorCode) { 969 return ucasemap_mapUTF8( 970 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL 971 dest, destCapacity, 972 src, srcLength, 973 ucasemap_internalUTF8Fold, nullptr, *pErrorCode); 974 } 975 976 U_NAMESPACE_BEGIN 977 978 void CaseMap::utf8ToLower( 979 const char *locale, uint32_t options, 980 StringPiece src, ByteSink &sink, Edits *edits, 981 UErrorCode &errorCode) { 982 ucasemap_mapUTF8( 983 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL 984 src.data(), src.length(), 985 ucasemap_internalUTF8ToLower, sink, edits, errorCode); 986 } 987 988 void CaseMap::utf8ToUpper( 989 const char *locale, uint32_t options, 990 StringPiece src, ByteSink &sink, Edits *edits, 991 UErrorCode &errorCode) { 992 ucasemap_mapUTF8( 993 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL 994 src.data(), src.length(), 995 ucasemap_internalUTF8ToUpper, sink, edits, errorCode); 996 } 997 998 void CaseMap::utf8Fold( 999 uint32_t options, 1000 StringPiece src, ByteSink &sink, Edits *edits, 1001 UErrorCode &errorCode) { 1002 ucasemap_mapUTF8( 1003 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL 1004 src.data(), src.length(), 1005 ucasemap_internalUTF8Fold, sink, edits, errorCode); 1006 } 1007 1008 int32_t CaseMap::utf8ToLower( 1009 const char *locale, uint32_t options, 1010 const char *src, int32_t srcLength, 1011 char *dest, int32_t destCapacity, Edits *edits, 1012 UErrorCode &errorCode) { 1013 return ucasemap_mapUTF8( 1014 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL 1015 dest, destCapacity, 1016 src, srcLength, 1017 ucasemap_internalUTF8ToLower, edits, errorCode); 1018 } 1019 1020 int32_t CaseMap::utf8ToUpper( 1021 const char *locale, uint32_t options, 1022 const char *src, int32_t srcLength, 1023 char *dest, int32_t destCapacity, Edits *edits, 1024 UErrorCode &errorCode) { 1025 return ucasemap_mapUTF8( 1026 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL 1027 dest, destCapacity, 1028 src, srcLength, 1029 ucasemap_internalUTF8ToUpper, edits, errorCode); 1030 } 1031 1032 int32_t CaseMap::utf8Fold( 1033 uint32_t options, 1034 const char *src, int32_t srcLength, 1035 char *dest, int32_t destCapacity, Edits *edits, 1036 UErrorCode &errorCode) { 1037 return ucasemap_mapUTF8( 1038 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL 1039 dest, destCapacity, 1040 src, srcLength, 1041 ucasemap_internalUTF8Fold, edits, errorCode); 1042 } 1043 1044 U_NAMESPACE_END