ucase.cpp (61666B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucase.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004aug30 16 * created by: Markus W. Scherer 17 * 18 * Low-level Unicode character/string case mapping code. 19 * Much code moved here (and modified) from uchar.c. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "unicode/unistr.h" 24 #include "unicode/uset.h" 25 #include "unicode/utf16.h" 26 #include "cmemory.h" 27 #include "uassert.h" 28 #include "ucase.h" 29 #include "umutex.h" 30 #include "utrie2.h" 31 32 /* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */ 33 #define INCLUDED_FROM_UCASE_CPP 34 #include "ucase_props_data.h" 35 36 /* set of property starts for UnicodeSet ------------------------------------ */ 37 38 static UBool U_CALLCONV 39 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 40 /* add the start code point to the USet */ 41 const USetAdder* sa = static_cast<const USetAdder*>(context); 42 sa->add(sa->set, start); 43 return true; 44 } 45 46 U_CFUNC void U_EXPORT2 47 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 48 if(U_FAILURE(*pErrorCode)) { 49 return; 50 } 51 52 /* add the start code point of each same-value range of the trie */ 53 utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa); 54 55 /* add code points with hardcoded properties, plus the ones following them */ 56 57 /* (none right now, see comment below) */ 58 59 /* 60 * Omit code points with hardcoded specialcasing properties 61 * because we do not 
build property UnicodeSets for them right now. 62 */ 63 } 64 65 /* data access primitives --------------------------------------------------- */ 66 67 U_CAPI const struct UCaseProps * U_EXPORT2 68 ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) { 69 *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions); 70 *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold); 71 return &ucase_props_singleton; 72 } 73 74 U_CFUNC const UTrie2 * U_EXPORT2 75 ucase_getTrie() { 76 return &ucase_props_singleton.trie; 77 } 78 79 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) 80 81 /* number of bits in an 8-bit integer value */ 82 static const uint8_t flagsOffset[256]={ 83 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 84 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 85 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 86 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 87 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 88 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 89 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 90 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 91 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 92 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 93 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 94 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 95 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 96 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 97 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 98 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 99 }; 100 101 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx))) 102 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)] 103 104 /* 105 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx). 
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 *
 * When UCASE_EXC_DOUBLE_SLOTS is set in excWord, every slot occupies two
 * uint16_t units and the value is assembled from them, high half first.
 * Callers below rely on where pExc16 is left: they reset it to the saved
 * start pointer before reading another slot.
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END

/* simple case mappings ----------------------------------------------------- */

/*
 * Simple (single code point) lowercase mapping.
 * Without an exceptions entry, the delta stored in the properties word is
 * added when the word marks c as upper- or titlecase.
 * With an exceptions entry, a delta slot is applied under the same type
 * condition (subtracted when UCASE_EXC_DELTA_IS_NEGATIVE is set);
 * otherwise the lowercase slot is used if present.
 * Returns c unchanged when none of these apply.
 */
U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            c+=UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;  /* pe now points at the first optional slot */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
        }
    }
    return c;
}

/*
 * Simple (single code point) uppercase mapping.
 * Same structure as ucase_tolower(), but the delta is applied only when
 * the type is UCASE_LOWER, and the fallback slot is UCASE_EXC_UPPER.
 */
U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
            c+=UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;  /* pe now points at the first optional slot */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
        }
    }
    return c;
}

/*
 * Simple (single code point) titlecase mapping.
 * Like ucase_toupper() for the non-exception and delta cases.
 * In the exceptions entry the title slot is preferred; if it is absent
 * the upper slot is used; if both are absent c is returned unchanged.
 */
U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
            c+=UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;  /* pe now points at the first optional slot */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        int32_t idx;
        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
            idx=UCASE_EXC_TITLE;
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            idx=UCASE_EXC_UPPER;
        } else {
            return c;
        }
        GET_SLOT_VALUE(excWord, idx, pe, c);
    }
    return c;
}

/*
 * Hardcoded UTF-16 strings: i, j and i-with-ogonek followed by U+0307,
 * and i + U+0307 followed by U+0300 / U+0301 / U+0303.
 * Note: iOgonekDot is declared with 3 elements but has only 2 initializers,
 * so its third element is zero-initialized.
 * Only iDot is used in the part of the file shown here.
 */
static const char16_t iDot[2] = { 0x69, 0x307 };
static const char16_t jDot[2] = { 0x6a, 0x307 };
static const char16_t iOgonekDot[3] = { 0x12f, 0x307 };
static const char16_t iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const char16_t iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };


/*
 * Adds to sa the code points and strings that are case-related to c:
 * the simple mappings, the full case folding string, and the code points
 * of the stored closure string. c itself is not added here.
 * The closure of I, i, U+0130 and U+0131 is hardcoded (see below).
 */
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
    } else {
        /*
         * c has exceptions, so there may be multiple simple and/or
         * full case mappings. Add them all.
         */
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;
        /* start of the slots; pe is reset to this before each GET_SLOT_VALUE() */
        const uint16_t *pe0=pe;

        // Hardcode the case closure of i and its relatives and ignore the
        // data file data for these characters.
        // The Turkic dotless i and dotted I with their case mapping conditions
        // and case folding option make the related characters behave specially.
        // This code matches their closure behavior to their case folding behavior.
        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
            // These characters have Turkic case foldings. Hardcode their closure.
            if (c == 0x49) {
                // Regular i and I are in one equivalence class.
                sa->add(sa->set, 0x69);
                return;
            } else if (c == 0x130) {
                // Dotted I is in a class with <0069 0307>
                // (for canonical equivalence with <0049 0307>).
                sa->addString(sa->set, iDot, 2);
                return;
            }
        } else if (c == 0x69) {
            sa->add(sa->set, 0x49);
            return;
        } else if (c == 0x131) {
            // Dotless i is in a class by itself.
            return;
        }

        /* add all simple case mappings */
        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
            if(HAS_SLOT(excWord, idx)) {
                pe=pe0;
                UChar32 mapping;
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
                sa->add(sa->set, mapping);
            }
        }
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
            pe=pe0;
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
        }

        /* get the closure string pointer & length */
        const char16_t *closure;
        int32_t closureLength;
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
        } else {
            closureLength=0;
            closure=nullptr;
        }

        /* add the full case folding */
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            pe=pe0;
            int32_t fullLength;
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

            /* start of full case mapping strings */
            ++pe;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            /* fullLength packs four string lengths, consumed 4 bits at a time from the low end */

            /* skip the lowercase result string */
            pe+=fullLength&UCASE_FULL_LOWER;
            fullLength>>=4;

            /* add the full case folding string */
            int32_t length=fullLength&0xf;
            if(length!=0) {
                sa->addString(sa->set, (const char16_t *)pe, length);
                pe+=length;
            }

            /* skip the uppercase and titlecase strings */
            fullLength>>=4;
            pe+=fullLength&0xf;
            fullLength>>=4;
            pe+=fullLength;

            closure=(const char16_t *)pe; /* behind full case mappings */
        }

        /* add each code point in the closure string */
        for(int32_t idx=0; idx<closureLength;) {
            UChar32 mapping;
            U16_NEXT_UNSAFE(closure, idx, mapping);
            sa->add(sa->set, mapping);
        }
    }
}

/*
 * Variant of ucase_addCaseClosure() for simple case folding:
 * adds the simple mappings and the closure string code points of c to sa,
 * but never adds a string. The full case mappings are only skipped over
 * in order to locate the closure string.
 * The hardcoded handling of I, i, U+0130 and U+0131 differs from
 * ucase_addCaseClosure() only for U+0130, which adds nothing here.
 */
U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
    } else {
        // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;
        /* start of the slots; pe is reset to this before each GET_SLOT_VALUE() */
        const uint16_t *pe0=pe;

        // Hardcode the case closure of i and its relatives and ignore the
        // data file data for these characters, like in ucase_addCaseClosure().
        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
            // These characters have Turkic case foldings. Hardcode their closure.
            if (c == 0x49) {
                // Regular i and I are in one equivalence class.
                sa->add(sa->set, 0x69);
                return;
            } else if (c == 0x130) {
                // For scf=Simple_Case_Folding, dotted I is in a class by itself.
                return;
            }
        } else if (c == 0x69) {
            sa->add(sa->set, 0x49);
            return;
        } else if (c == 0x131) {
            // Dotless i is in a class by itself.
            return;
        }

        // Add all simple case mappings.
        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
            if(HAS_SLOT(excWord, idx)) {
                pe=pe0;
                UChar32 mapping;
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
                sa->add(sa->set, mapping);
            }
        }
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
            pe=pe0;
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
            sa->add(sa->set, mapping);
        }

        /* get the closure string pointer & length */
        const char16_t *closure;
        int32_t closureLength;
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
        } else {
            closureLength=0;
            closure=nullptr;
        }

        // Skip the full case mappings.
        // (Only needed when there is a closure string to locate.)
        if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            pe=pe0;
            int32_t fullLength;
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

            /* start of full case mapping strings */
            ++pe;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            // Skip all 4 full case mappings.
            pe+=fullLength&UCASE_FULL_LOWER;
            fullLength>>=4;
            pe+=fullLength&0xf;
            fullLength>>=4;
            pe+=fullLength&0xf;
            fullLength>>=4;
            pe+=fullLength;

            closure=(const char16_t *)pe; /* behind full case mappings */
        }

        // Add each code point in the closure string whose scf maps back to c.
        // NOTE(review): the loop below adds every closure code point
        // unconditionally; no scf check is visible here -- confirm that
        // the data guarantees the condition stated above.
        for(int32_t idx=0; idx<closureLength;) {
            UChar32 mapping;
            U16_NEXT_UNSAFE(closure, idx, mapping);
            sa->add(sa->set, mapping);
        }
    }
}

/*
 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
 * must be length>0 and max>0 and length<=max
 *
 * Returns 0 if equal, a positive value if t ends before s or if the first
 * differing unit of s is greater, and a negative value if the first
 * differing unit of s is smaller or if t continues beyond the end of s.
 */
static inline int32_t
strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
    int32_t c1, c2;

    max-=length; /* we require length<=max, so no need to decrement max in the loop */
    do {
        c1=*s++;
        c2=*t++;
        if(c2==0) {
            return 1; /* reached the end of t but not of s */
        }
        c1-=c2;
        if(c1!=0) {
            return c1; /* return difference result */
        }
    } while(--length>0);
    /* ends with length==0 */

    /* max now holds the number of units t may still have beyond s */
    if(max==0 || *t==0) {
        return 0; /* equal to length of both strings */
    } else {
        return -max; /* return length difference */
    }
}

/*
 * Looks up the string s[0..length[ in the reverse case folding ("unfold")
 * table with a binary search. If a row matches, each code point stored in
 * the rest of that row is added to sa together with its case closure.
 *
 * @return true if the string was found; false if it was not found, if there
 *         is no unfold data, if s is nullptr, or if length is <=1 or
 *         greater than the table's string width
 */
U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;

    if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
        return false; /* no reverse case folding data, or no string */
    }
    if(length<=1) {
        /* the string is too short to find any match */
        /*
         * more precise would be:
         * if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found
         */
        return false;
    }

    /* the first row of the table is a header with the table dimensions */
    const uint16_t *unfold=ucase_props_singleton.unfold;
    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    unfold+=unfoldRowWidth; /* skip the header row */

    if(length>unfoldStringWidth) {
        /* the string is too long to find any match */
        return false;
    }

    /* do a binary search for the string */
    start=0;
    limit=unfoldRows;
    while(start<limit) {
        i=(start+limit)/2;
        const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
        result=strcmpMax(s, length, p, unfoldStringWidth);

        if(result==0) {
            /* found the string: add each code point, and its case closure */
            UChar32 c;

            /* i is reused here as the index into the matching row */
            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
                U16_NEXT_UNSAFE(p, i, c);
                sa->add(sa->set, c);
                ucase_addCaseClosure(c, sa);
            }
            return true;
        } else if(result<0) {
            limit=i;
        } else /* result>0 */ {
            start=i+1;
        }
    }

    return false; /* string not found */
}

U_NAMESPACE_BEGIN

/*
 * Sets up iteration over the unfold table: reads the table dimensions from
 * the header row, then moves the unfold pointer past that row.
 * NOTE(review): unlike ucase_addStringCaseClosure(), there is no nullptr
 * check on ucase_props_singleton.unfold before it is dereferenced here.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}

/*
 * Delivers the next code point of the unfold table and sets full to the
 * string in the first column of the row that the code point belongs to.
 * Moves on to the next row once the current row's code points are used up.
 *
 * @return the next code point, or U_SENTINEL after the last row
 */
UChar32
FullCaseFoldingIterator::next(UnicodeString &full) {
    // Advance past the last-delivered code point.
    const char16_t *p=unfold+(currentRow*unfoldRowWidth);
    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
        ++currentRow;
        p+=unfoldRowWidth;
        rowCpIndex=unfoldStringWidth;
    }
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
    // Set "full" to the NUL-terminated string in the first unfold column.
    int32_t length=unfoldStringWidth;
    while(length>0 && p[length-1]==0) { --length; }
    full.setTo(false, p, length);
    // Return the code point.
    UChar32 c;
    U16_NEXT_UNSAFE(p, rowCpIndex, c);
    return c;
}

namespace LatinCase {

/*
 * Four tables of LIMIT int8_t entries each, written as 16 values per row.
 * NOTE(review): presumably each entry is the case mapping delta for the
 * code point equal to its index, with EXC marking code points that need
 * the regular (non-table) code path; LIMIT and EXC are declared elsewhere --
 * confirm against their declarations.
 */
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

const int8_t TO_LOWER_TR_LT[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

const int8_t TO_UPPER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

const int8_t TO_UPPER_TR[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

}  // namespace LatinCase

U_NAMESPACE_END

/** @return UCASE_NONE, 
UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ 672 U_CAPI int32_t U_EXPORT2 673 ucase_getType(UChar32 c) { 674 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 675 return UCASE_GET_TYPE(props); 676 } 677 678 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ 679 U_CAPI int32_t U_EXPORT2 680 ucase_getTypeOrIgnorable(UChar32 c) { 681 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 682 return UCASE_GET_TYPE_AND_IGNORABLE(props); 683 } 684 685 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ 686 static inline int32_t 687 getDotType(UChar32 c) { 688 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 689 if(!UCASE_HAS_EXCEPTION(props)) { 690 return props&UCASE_DOT_MASK; 691 } else { 692 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 693 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; 694 } 695 } 696 697 U_CAPI UBool U_EXPORT2 698 ucase_isSoftDotted(UChar32 c) { 699 return getDotType(c)==UCASE_SOFT_DOTTED; 700 } 701 702 U_CAPI UBool U_EXPORT2 703 ucase_isCaseSensitive(UChar32 c) { 704 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 705 if(!UCASE_HAS_EXCEPTION(props)) { 706 return (props&UCASE_SENSITIVE)!=0; 707 } else { 708 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 709 return (*pe&UCASE_EXC_SENSITIVE)!=0; 710 } 711 } 712 713 /* string casing ------------------------------------------------------------ */ 714 715 /* 716 * These internal functions form the core of string case mappings. 717 * They map single code points to result code points or strings and take 718 * all necessary conditions (context, locale ID, options) into account. 719 * 720 * They do not iterate over the source or write to the destination 721 * so that the same functions are useful for non-standard string storage, 722 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 
 * For the same reason, the "surrounding text" context is passed in as a
 * UCaseContextIterator which does not make any assumptions about
 * the underlying storage.
 *
 * This section contains helper functions that check for conditions
 * in the input text surrounding the current code point
 * according to SpecialCasing.txt.
 *
 * Each helper function gets the index
 * - after the current code point if it looks at following text
 * - before the current code point if it looks at preceding text
 *
 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
 *
 * Final_Sigma
 *   C is preceded by a sequence consisting of
 *     a cased letter and a case-ignorable sequence,
 *   and C is not followed by a sequence consisting of
 *     an ignorable sequence and then a cased letter.
 *
 * More_Above
 *   C is followed by one or more characters of combining class 230 (ABOVE)
 *   in the combining character sequence.
 *
 * After_Soft_Dotted
 *   The last preceding character with combining class of zero before C
 *   was Soft_Dotted,
 *   and there is no intervening combining character class 230 (ABOVE).
 *
 * Before_Dot
 *   C is followed by combining dot above (U+0307).
 *   Any sequence of characters with a combining class that is neither 0 nor 230
 *   may intervene between the current character and the combining dot above.
 *
 * The erratum from 2002-10-31 adds the condition
 *
 * After_I
 *   The last preceding base character was an uppercase I, and there is no
 *   intervening combining character class 230 (ABOVE).
 *
 *   (See Jitterbug 2344 and the comments on After_I below.)
 *
 * Helper definitions in Unicode 3.2 UAX 21:
 *
 * D1. 
A character C is defined to be cased
 *     if it meets any of the following criteria:
 *
 *   - The general category of C is Titlecase Letter (Lt)
 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
 *   - Given D = NFD(C), then it is not the case that:
 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
 *      for Unicode 3.2. Ignored.)
 *
 * D2. A character C is defined to be case-ignorable
 *     if it meets either of the following criteria:
 *
 *   - The general category of C is
 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
 *   - C is one of the following characters
 *     U+0027 APOSTROPHE
 *     U+00AD SOFT HYPHEN (SHY)
 *     U+2019 RIGHT SINGLE QUOTATION MARK
 *            (the preferred character for apostrophe)
 *
 * D3. A case-ignorable sequence is a sequence of
 *     zero or more case-ignorable characters.
 */

/*
 * Case-insensitive tests for a single letter, used for parsing the language
 * code in ucase_getCaseLocale().
 * Each macro evaluates its argument twice, so the argument must not have
 * side effects.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_y(c) ((c)=='y' || (c)=='Y')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? */
/* true for '_', '-' and the terminating NUL; evaluates its argument up to three times */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)

/**
 * Requires non-nullptr locale ID but otherwise does the equivalent of
 * checking for language codes as if uloc_getLanguage() were called:
 * Accepts both 2- and 3-letter codes and accepts case variants.
810 */ 811 U_CFUNC int32_t 812 ucase_getCaseLocale(const char *locale) { 813 /* 814 * This function used to use uloc_getLanguage(), but the current code 815 * removes the dependency of this low-level code on uloc implementation code 816 * and is faster because not the whole locale ID has to be 817 * examined and copied/transformed. 818 * 819 * Because this code does not want to depend on uloc, the caller must 820 * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault(). 821 */ 822 char c=*locale++; 823 // Fastpath for English "en" which is often used for default (=root locale) case mappings, 824 // and for Chinese "zh": Very common but no special case mapping behavior. 825 // Then check lowercase vs. uppercase to reduce the number of comparisons 826 // for other locales without special behavior. 827 if(c=='e') { 828 /* el or ell? */ 829 c=*locale++; 830 if(is_l(c)) { 831 c=*locale++; 832 if(is_l(c)) { 833 c=*locale; 834 } 835 if(is_sep(c)) { 836 return UCASE_LOC_GREEK; 837 } 838 } 839 // en, es, ... -> root 840 } else if(c=='z') { 841 return UCASE_LOC_ROOT; 842 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 843 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z 844 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY 845 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z 846 #else 847 # error Unknown charset family! 848 #endif 849 // lowercase c 850 if(c=='t') { 851 /* tr or tur? */ 852 c=*locale++; 853 if(is_u(c)) { 854 c=*locale++; 855 } 856 if(is_r(c)) { 857 c=*locale; 858 if(is_sep(c)) { 859 return UCASE_LOC_TURKISH; 860 } 861 } 862 } else if(c=='a') { 863 /* az or aze? */ 864 c=*locale++; 865 if(is_z(c)) { 866 c=*locale++; 867 if(is_e(c)) { 868 c=*locale; 869 } 870 if(is_sep(c)) { 871 return UCASE_LOC_TURKISH; 872 } 873 } 874 } else if(c=='l') { 875 /* lt or lit? 
*/ 876 c=*locale++; 877 if(is_i(c)) { 878 c=*locale++; 879 } 880 if(is_t(c)) { 881 c=*locale; 882 if(is_sep(c)) { 883 return UCASE_LOC_LITHUANIAN; 884 } 885 } 886 } else if(c=='n') { 887 /* nl or nld? */ 888 c=*locale++; 889 if(is_l(c)) { 890 c=*locale++; 891 if(is_d(c)) { 892 c=*locale; 893 } 894 if(is_sep(c)) { 895 return UCASE_LOC_DUTCH; 896 } 897 } 898 } else if(c=='h') { 899 /* hy or hye? *not* hyw */ 900 c=*locale++; 901 if(is_y(c)) { 902 c=*locale++; 903 if(is_e(c)) { 904 c=*locale; 905 } 906 if(is_sep(c)) { 907 return UCASE_LOC_ARMENIAN; 908 } 909 } 910 } 911 } else { 912 // uppercase c 913 // Same code as for lowercase c but also check for 'E'. 914 if(c=='T') { 915 /* tr or tur? */ 916 c=*locale++; 917 if(is_u(c)) { 918 c=*locale++; 919 } 920 if(is_r(c)) { 921 c=*locale; 922 if(is_sep(c)) { 923 return UCASE_LOC_TURKISH; 924 } 925 } 926 } else if(c=='A') { 927 /* az or aze? */ 928 c=*locale++; 929 if(is_z(c)) { 930 c=*locale++; 931 if(is_e(c)) { 932 c=*locale; 933 } 934 if(is_sep(c)) { 935 return UCASE_LOC_TURKISH; 936 } 937 } 938 } else if(c=='L') { 939 /* lt or lit? */ 940 c=*locale++; 941 if(is_i(c)) { 942 c=*locale++; 943 } 944 if(is_t(c)) { 945 c=*locale; 946 if(is_sep(c)) { 947 return UCASE_LOC_LITHUANIAN; 948 } 949 } 950 } else if(c=='E') { 951 /* el or ell? */ 952 c=*locale++; 953 if(is_l(c)) { 954 c=*locale++; 955 if(is_l(c)) { 956 c=*locale; 957 } 958 if(is_sep(c)) { 959 return UCASE_LOC_GREEK; 960 } 961 } 962 } else if(c=='N') { 963 /* nl or nld? */ 964 c=*locale++; 965 if(is_l(c)) { 966 c=*locale++; 967 if(is_d(c)) { 968 c=*locale; 969 } 970 if(is_sep(c)) { 971 return UCASE_LOC_DUTCH; 972 } 973 } 974 } else if(c=='H') { 975 /* hy or hye? *not* hyw */ 976 c=*locale++; 977 if(is_y(c)) { 978 c=*locale++; 979 if(is_e(c)) { 980 c=*locale; 981 } 982 if(is_sep(c)) { 983 return UCASE_LOC_ARMENIAN; 984 } 985 } 986 } 987 } 988 return UCASE_LOC_ROOT; 989 } 990 991 /* 992 * Is followed by 993 * {case-ignorable}* cased 994 * ? 
995 * (dir determines looking forward/backward) 996 * If a character is case-ignorable, it is skipped regardless of whether 997 * it is also cased or not. 998 */ 999 static UBool 1000 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) { 1001 UChar32 c; 1002 1003 if(iter==nullptr) { 1004 return false; 1005 } 1006 1007 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { 1008 int32_t type=ucase_getTypeOrIgnorable(c); 1009 if(type&4) { 1010 /* case-ignorable, continue with the loop */ 1011 } else if(type!=UCASE_NONE) { 1012 return true; /* followed by cased letter */ 1013 } else { 1014 return false; /* uncased and not case-ignorable */ 1015 } 1016 } 1017 1018 return false; /* not followed by cased letter */ 1019 } 1020 1021 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */ 1022 static UBool 1023 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) { 1024 UChar32 c; 1025 int32_t dotType; 1026 int8_t dir; 1027 1028 if(iter==nullptr) { 1029 return false; 1030 } 1031 1032 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 1033 dotType=getDotType(c); 1034 if(dotType==UCASE_SOFT_DOTTED) { 1035 return true; /* preceded by TYPE_i */ 1036 } else if(dotType!=UCASE_OTHER_ACCENT) { 1037 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ 1038 } 1039 } 1040 1041 return false; /* not preceded by TYPE_i */ 1042 } 1043 1044 /* 1045 * See Jitterbug 2344: 1046 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above 1047 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because 1048 * we made those releases compatible with Unicode 3.2 which had not fixed 1049 * a related bug in SpecialCasing.txt. 1050 * 1051 * From the Jitterbug 2344 text: 1052 * ... this bug is listed as a Unicode erratum 1053 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html 1054 * <quote> 1055 * There are two errors in SpecialCasing.txt. 1056 * 1. 
Missing semicolons on two lines. ... [irrelevant for ICU] 1057 * 2. An incorrect context definition. Correct as follows: 1058 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE 1059 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE 1060 * --- 1061 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 1062 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 1063 * where the context After_I is defined as: 1064 * The last preceding base character was an uppercase I, and there is no 1065 * intervening combining character class 230 (ABOVE). 1066 * </quote> 1067 * 1068 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: 1069 * 1070 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 1071 * # This matches the behavior of the canonically equivalent I-dot_above 1072 * 1073 * See also the description in this place in older versions of uchar.c (revision 1.100). 1074 * 1075 * Markus W. Scherer 2003-feb-15 1076 */ 1077 1078 /* Is preceded by base character 'I' with no intervening cc=230 ? */ 1079 static UBool 1080 isPrecededBy_I(UCaseContextIterator *iter, void *context) { 1081 UChar32 c; 1082 int32_t dotType; 1083 int8_t dir; 1084 1085 if(iter==nullptr) { 1086 return false; 1087 } 1088 1089 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 1090 if(c==0x49) { 1091 return true; /* preceded by I */ 1092 } 1093 dotType=getDotType(c); 1094 if(dotType!=UCASE_OTHER_ACCENT) { 1095 return false; /* preceded by different base character (not I), or intervening cc==230 */ 1096 } 1097 } 1098 1099 return false; /* not preceded by I */ 1100 } 1101 1102 /* Is followed by one or more cc==230 ? 
*/
/*
 * Context condition helper (used for the Lithuanian More_Above rules below).
 * Iterates forward with iter/context and classifies each code point with
 * getDotType(): returns true at the first UCASE_ABOVE, keeps going over
 * UCASE_OTHER_ACCENT, and returns false on anything else, on a negative
 * iterator result, or when iter is nullptr.
 */
static UBool
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
    UChar32 c;
    int32_t dotType;
    int8_t dir;

    if(iter==nullptr) {
        return false;
    }

    /* dir=1 on the first call selects forward iteration, dir=0 continues it */
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
        dotType=getDotType(c);
        if(dotType==UCASE_ABOVE) {
            return true; /* at least one cc==230 following */
        } else if(dotType!=UCASE_OTHER_ACCENT) {
            return false; /* next base character, no more cc==230 following */
        }
    }

    return false; /* no more cc==230 following */
}

/*
 * Is followed by a dot above (without cc==230 in between) ?
 *
 * Context condition helper (used for the Turkic Not_Before_Dot rule below).
 * Iterates forward: returns true as soon as U+0307 is seen, keeps going over
 * code points whose getDotType() is UCASE_OTHER_ACCENT, and returns false
 * otherwise, including when iter is nullptr.
 */
static UBool
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
    UChar32 c;
    int32_t dotType;
    int8_t dir;

    if(iter==nullptr) {
        return false;
    }

    /* dir=1 on the first call selects forward iteration, dir=0 continues it */
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
        if(c==0x307) {
            return true;
        }
        dotType=getDotType(c);
        if(dotType!=UCASE_OTHER_ACCENT) {
            return false; /* next base character or cc==230 in between */
        }
    }

    return false; /* no dot above following */
}

/*
 * Full lowercase mapping of c, including the hardcoded conditional
 * (context- and locale-sensitive) mappings from SpecialCasing.txt.
 *
 * c        code point to map; asserted to be non-negative
 * iter,
 * context  callback and its state, handed to the isFollowedBy.../isPrecededBy...
 *          helpers above to look at the surrounding text; with iter==nullptr
 *          those helpers all return false
 * pString  output; always reset to nullptr first, and set to the result
 *          string only when a string mapping is returned
 * loc      case locale; compared here against UCASE_LOC_LITHUANIAN and
 *          UCASE_LOC_TURKISH
 *
 * Return values produced by the code below:
 * - a string length (with *pString pointing to the char16_t string)
 * - 0 with *pString==nullptr when the code point is to be removed
 * - a single mapped code point
 * - ~c (which is negative) when the final simple result is c itself
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const char16_t **pString,
                  int32_t loc) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    UChar32 result=c;
    // Reset the output pointer in case it was uninitialized.
    *pString=nullptr;
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* simple case: the trie value carries a signed delta for uppercase/titlecase letters */
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            result=c+UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
        uint16_t excWord=*pe++;
        int32_t full;

        /*
         * Keep a second copy of the pointer to the exception slots:
         * pe is handed to GET_SLOT_VALUE for the full-mappings slot,
         * pe2 is used for the simple-mapping slots further down.
         * NOTE(review): presumably needed because GET_SLOT_VALUE moves the
         * pointer it is given -- confirm against the macro definition.
         */
        pe2=pe;

        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
            /* use hardcoded conditions and mappings */

            /*
             * Test for conditional mappings first
             *   (otherwise the unconditional default mappings are always taken),
             * then test for characters that have unconditional mappings in SpecialCasing.txt,
             * then get the UnicodeData.txt mappings.
             */
            if( loc==UCASE_LOC_LITHUANIAN &&
                    /* base characters, find accents above */
                    (((c==0x49 || c==0x4a || c==0x12e) &&
                        isFollowedByMoreAbove(iter, context)) ||
                    /* precomposed with accent above, no need to find one */
                    (c==0xcc || c==0xcd || c==0x128))
            ) {
                /*
                    # Lithuanian

                    # Lithuanian retains the dot in a lowercase i when followed by accents.

                    # Introduce an explicit dot above when lowercasing capital I's and J's
                    # whenever there are more accents above.
                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
                 */
                /* each case returns the length of the constant string assigned to *pString */
                switch(c) {
                case 0x49:  /* LATIN CAPITAL LETTER I */
                    *pString=iDot;
                    return 2;
                case 0x4a:  /* LATIN CAPITAL LETTER J */
                    *pString=jDot;
                    return 2;
                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                    *pString=iOgonekDot;
                    return 2;
                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                    *pString=iDotGrave;
                    return 3;
                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                    *pString=iDotAcute;
                    return 3;
                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                    *pString=iDotTilde;
                    return 3;
                default:
                    return 0; /* will not occur */
                }
            /* # Turkish and Azeri */
            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                    #   The following rules handle those cases.

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            } else if(c==0x130) {
                /*
                    # Preserve canonical equivalence for I with dot. Turkic is handled below.

                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                *pString=iDot;
                return 2;
            } else if(  c==0x3a3 &&
                        !isFollowedByCasedLetter(iter, context, 1) &&
                        isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
            ) {
                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
                /*
                    # Special case for final form of sigma

                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
                 */
                return 0x3c2; /* greek small final sigma */
            } else {
                /* no known conditional special case mapping, use a normal mapping */
            }
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
            /* keep only the length of the lowercase string */
            full&=UCASE_FULL_LOWER;
            if(full!=0) {
                /* set the output pointer to the lowercase mapping */
                *pString=reinterpret_cast<const char16_t *>(pe+1);

                /* return the string length */
                return full;
            }
        }

        /* no string result: fall back to the simple mapping, delta slot first, then lowercase slot */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
        }
    }

    /* an unchanged code point is reported as its bitwise complement, i.e., a negative value */
    return (result==c) ? ~result : result;
}

/*
 * internal
 *
 * Shared implementation of ucase_toFullUpper() (upperNotTitle=true) and
 * ucase_toFullTitle() (upperNotTitle=false).
 * Parameters and return value conventions are the same as documented by the
 * code of ucase_toFullLower(): *pString is reset to nullptr and only set for
 * string results, whose length is returned; 0 means "remove"; ~c is
 * returned when the final simple result is c itself.
 * loc is compared here against UCASE_LOC_TURKISH, UCASE_LOC_LITHUANIAN
 * and UCASE_LOC_ARMENIAN.
 */
static int32_t
toUpperOrTitle(UChar32 c,
               UCaseContextIterator *iter, void *context,
               const char16_t **pString,
               int32_t loc,
               UBool upperNotTitle) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    UChar32 result=c;
    // Reset the output pointer in case it was uninitialized.
    *pString=nullptr;
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* simple case: the trie value carries a signed delta for lowercase letters */
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
            result=c+UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
        uint16_t excWord=*pe++;
        int32_t full, idx;

        /* second copy of the slots pointer for the simple-mapping lookups below (pe is used for the full mappings) */
        pe2=pe;

        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
            /* use hardcoded conditions and mappings */
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
                /*
                    # Turkish and Azeri

                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                    #   The following rules handle those cases.

                    # When uppercasing, i turns into a dotted capital I

                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
                */
                return 0x130;
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
                /*
                    # Lithuanian

                    # Lithuanian retains the dot in a lowercase i when followed by accents.

                    # Remove DOT ABOVE after "i" with upper or titlecase

                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            } else if(c==0x0587) {
                // See ICU-13416:
                // և ligature ech-yiwn
                // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
                // but to ԵՎ=ech+vew in Eastern Armenian.
                if(loc==UCASE_LOC_ARMENIAN) {
                    *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
                } else {
                    *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
                }
                /* all four strings are two code units long */
                return 2;
            } else {
                /* no known conditional special case mapping, use a normal mapping */
            }
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

            /* start of full case mapping strings */
            ++pe;

            /*
             * full holds one 4-bit string length per mapping, lowest bits first:
             * lowercase, case folding, uppercase, titlecase.
             * The strings are stored back to back in the same order.
             */
            /* skip the lowercase and case-folding result strings */
            pe+=full&UCASE_FULL_LOWER;
            full>>=4;
            pe+=full&0xf;
            full>>=4;

            if(upperNotTitle) {
                full&=0xf;
            } else {
                /* skip the uppercase result string */
                pe+=full&0xf;
                full=(full>>4)&0xf;
            }

            if(full!=0) {
                /* set the output pointer to the result string */
                *pString=reinterpret_cast<const char16_t *>(pe);

                /* return the string length */
                return full;
            }
        }

        /* no string result: fall back to the simple mapping, delta slot first */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        /* a separate titlecase slot is only consulted for titlecasing */
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
            idx=UCASE_EXC_TITLE;
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            /* here, titlecase is same as uppercase */
            idx=UCASE_EXC_UPPER;
        } else {
            /* neither slot present: c maps to itself */
            return ~c;
        }
        GET_SLOT_VALUE(excWord, idx, pe2, result);
    }

    /* an unchanged code point is reported as its bitwise complement, i.e., a negative value */
    return (result==c) ? ~result : result;
}

/* Full uppercase mapping: toUpperOrTitle() with upperNotTitle=true. */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const char16_t **pString,
                  int32_t caseLocale) {
    return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
}

/* Full titlecase mapping: toUpperOrTitle() with upperNotTitle=false. */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const char16_t **pString,
                  int32_t caseLocale) {
    return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
}

/* case folding ------------------------------------------------------------- */

/*
 * Case folding is similar to lowercasing.
 * The result may be a simple mapping, i.e., a single code point, or
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
 *
 * Some special cases are hardcoded because their conditions cannot be
 * parsed and processed from CaseFolding.txt.
 *
 * Unicode 3.2 CaseFolding.txt specifies for its status field:

# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
#    - For non-Turkic languages, this mapping is normally not used.
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
#  A. To do a simple case folding, use the mappings with status C + S.
#  B. To do a full case folding, use the mappings with status C + F.
#
#    The mappings with status T can be used or omitted depending on the desired case-folding
#    behavior.
(The default option is to exclude them.)

 * Unicode 3.2 has 'T' mappings as follows:

0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * while the default mappings for these code points are:

0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * U+0130 has no simple case folding (simple-case-folds to itself).
 */

/*
 * return the simple case folding mapping for c
 *
 * options is masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects
 * the default mappings for U+0049 and U+0130, any other masked value selects
 * the Turkic ('T') mappings.
 * Unlike the ucase_toFullXyz() functions, the folded code point is returned
 * as is, i.e., c itself when there is no simple case folding.
 */
U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c, uint32_t options) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* simple case: the trie value carries a signed delta for uppercase/titlecase letters */
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            c+=UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;
        int32_t idx;
        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
            /* special case folding mappings, hardcoded */
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
                /* default mappings */
                if(c==0x49) {
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
                    return 0x69;
                } else if(c==0x130) {
                    /* no simple case folding for U+0130 */
                    return c;
                }
            } else {
                /* Turkic mappings */
                if(c==0x49) {
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
                    return 0x131;
                } else if(c==0x130) {
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                    return 0x69;
                }
            }
        }
        /* flagged as having no simple case folding: c folds to itself */
        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
            return c;
        }
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        /* prefer a dedicated case folding slot, otherwise reuse the lowercase mapping */
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
            idx=UCASE_EXC_FOLD;
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            idx=UCASE_EXC_LOWER;
        } else {
            return c;
        }
        GET_SLOT_VALUE(excWord, idx, pe, c);
    }
    return c;
}

/*
 * Issue for canonical caseless match (UAX #21):
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
 * For more details, see the comment in unorm_compare() in unorm.cpp
 * and the intermediate prototype changes for Jitterbug 2021.
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
 *
 * This did not get fixed because it appears that it is not possible to fix
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
 * together in a way that they still fold to common result strings.
 */

/*
 * Full case folding of c.
 *
 * pString  output; always reset to nullptr first, and set to the result
 *          string only when a string mapping is returned
 * options  masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects
 *          the default mappings for U+0049 and U+0130, any other masked
 *          value selects the Turkic ('T') mappings
 *
 * Return values produced by the code below:
 * - a string length (with *pString pointing to the char16_t string)
 * - a single folded code point
 * - ~c (which is negative) when c folds to itself
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,
                    const char16_t **pString,
                    uint32_t options) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    UChar32 result=c;
    // Reset the output pointer in case it was uninitialized.
    *pString=nullptr;
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* simple case: the trie value carries a signed delta for uppercase/titlecase letters */
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            result=c+UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
        uint16_t excWord=*pe++;
        int32_t full, idx;

        /* second copy of the slots pointer for the simple-mapping lookups below (pe is used for the full mappings) */
        pe2=pe;

        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
            /* use hardcoded conditions and mappings */
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
                /* default mappings */
                if(c==0x49) {
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
                    return 0x69;
                } else if(c==0x130) {
                    /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                    *pString=iDot;
                    return 2;
                }
            } else {
                /* Turkic mappings */
                if(c==0x49) {
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
                    return 0x131;
                } else if(c==0x130) {
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                    return 0x69;
                }
            }
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

            /* start of full case mapping strings */
            ++pe;

            /* skip the lowercase result string */
            pe+=full&UCASE_FULL_LOWER;
            /* the next 4 bits are the length of the case folding string */
            full=(full>>4)&0xf;

            if(full!=0) {
                /* set the output pointer to the result string */
                *pString=reinterpret_cast<const char16_t *>(pe);

                /* return the string length */
                return full;
            }
        }

        /* no string result: fall back to the simple case folding */
        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
            return ~c;
        }
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
        }
        /* prefer a dedicated case folding slot, otherwise reuse the lowercase mapping */
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
            idx=UCASE_EXC_FOLD;
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            idx=UCASE_EXC_LOWER;
        } else {
            return ~c;
        }
        GET_SLOT_VALUE(excWord, idx, pe2, result);
    }

    /* an unchanged code point is reported as its bitwise complement, i.e., a negative value */
    return (result==c) ? ~result : result;
}

/* case mapping properties API ---------------------------------------------- */

/* public API (see uchar.h) */

/* True if ucase_getType(c) is UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    return UCASE_LOWER==ucase_getType(c);
}

/* True if ucase_getType(c) is UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    return UCASE_UPPER==ucase_getType(c);
}

/* Transforms the Unicode character to its lower case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c) {
    return ucase_tolower(c);
}

/* Transforms the Unicode character to its upper case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c) {
    return ucase_toupper(c);
}

/* Transforms the Unicode character to its title case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c) {
    return ucase_totitle(c);
}

/* return the simple case folding mapping for c */
U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c, uint32_t options) {
    return ucase_fold(c, options);
}

/*
 * Returns the value of the case-related binary property `which` for c.
 * Properties that are not handled in the switch below yield false.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* case mapping properties */
    /* receives the string output of the ucase_toFullXyz() calls; its value is not used */
    const char16_t *resultString;
    switch(which) {
    case UCHAR_LOWERCASE:
        return (UBool)(UCASE_LOWER==ucase_getType(c));
    case UCHAR_UPPERCASE:
        return (UBool)(UCASE_UPPER==ucase_getType(c));
    case UCHAR_SOFT_DOTTED:
        return ucase_isSoftDotted(c);
    case UCHAR_CASE_SENSITIVE:
        return ucase_isCaseSensitive(c);
    case UCHAR_CASED:
        return (UBool)(UCASE_NONE!=ucase_getType(c));
    case UCHAR_CASE_IGNORABLE:
        /* shifts out the two low (type) bits; isFollowedByCasedLetter() tests the same flag as type&4 */
        return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
    /*
     * Note: The following Changes_When_Xyz are defined as testing whether
     * the NFD form of the input changes when Xyz-case-mapped.
     * However, this simpler implementation of these properties,
     * ignoring NFD, passes the tests.
     * The implementation needs to be changed if the tests start failing.
     * When that happens, optimizations should be used to work with the
     * per-single-code point ucase_toFullXyz() functions unless
     * the NFD form has more than one code point,
     * and the property starts set needs to be the union of the
     * start sets for normalization and case mappings.
     */
    /*
     * The ucase_toFullXyz() functions return a negative value (~c) when c
     * maps to itself, so a result >=0 means that the mapping changes c.
     * They are called without a context iterator and with the root locale.
     */
    case UCHAR_CHANGES_WHEN_LOWERCASED:
        return (UBool)(ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_UPPERCASED:
        return (UBool)(ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_TITLECASED:
        return (UBool)(ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
        return (UBool)(
            ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
            ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
            ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
    default:
        return false;
    }
}