uchar.cpp (22290B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************** 5 * Copyright (C) 1996-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************** 8 * 9 * File UCHAR.C 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 04/02/97 aliu Creation. 15 * 4/15/99 Madhu Updated all the function definitions for C Implementation 16 * 5/20/99 Madhu Added the function u_getVersion() 17 * 8/19/1999 srl Upgraded scripts to Unicode3.0 18 * 11/11/1999 weiv added u_isalnum(), cleaned comments 19 * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion. 20 * 06/20/2000 helena OS/400 port changes; mostly typecast. 21 ****************************************************************************** 22 */ 23 24 #include "unicode/utypes.h" 25 #include "unicode/uchar.h" 26 #include "unicode/ucptrie.h" 27 #include "unicode/uscript.h" 28 #include "unicode/udata.h" 29 #include "uassert.h" 30 #include "cmemory.h" 31 #include "ucln_cmn.h" 32 #include "utrie2.h" 33 #include "udataswp.h" 34 #include "uprops.h" 35 #include "ustr_imp.h" 36 37 /* uchar_props_data.h is machine-generated by genprops --csource */ 38 #define INCLUDED_FROM_UCHAR_C 39 #include "uchar_props_data.h" 40 41 /* constants and macros for access to the data ------------------------------ */ 42 43 /* getting a uint32_t properties word from the data */ 44 #define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c)) 45 46 /* API functions ------------------------------------------------------------ */ 47 48 /* Gets the Unicode character's general category.*/ 49 U_CAPI int8_t U_EXPORT2 50 u_charType(UChar32 c) { 51 uint32_t props; 52 GET_PROPS(c, props); 53 return (int8_t)GET_CATEGORY(props); 54 } 55 56 /* Enumerate all code points with their general categories. */ 57 struct _EnumTypeCallback { 58 UCharEnumTypeRange *enumRange; 59 const void *context; 60 }; 61 62 static uint32_t U_CALLCONV 63 _enumTypeValue(const void *context, uint32_t value) { 64 (void)context; 65 return GET_CATEGORY(value); 66 } 67 68 static UBool U_CALLCONV 69 _enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 70 /* just cast the value to UCharCategory */ 71 return static_cast<const _EnumTypeCallback*>(context)-> 72 enumRange(static_cast<const _EnumTypeCallback*>(context)->context, 73 start, end + 1, static_cast<UCharCategory>(value)); 74 } 75 76 U_CAPI void U_EXPORT2 77 u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) { 78 struct _EnumTypeCallback callback; 79 80 if(enumRange==nullptr) { 81 return; 82 } 83 84 callback.enumRange=enumRange; 85 callback.context=context; 86 utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback); 87 } 88 89 /* Checks if ch is a lower case letter.*/ 90 U_CAPI UBool U_EXPORT2 91 u_islower(UChar32 c) { 92 uint32_t props; 93 GET_PROPS(c, props); 94 return GET_CATEGORY(props)==U_LOWERCASE_LETTER; 95 } 96 97 /* Checks if ch is an upper case letter.*/ 98 U_CAPI UBool U_EXPORT2 99 u_isupper(UChar32 c) { 100 uint32_t props; 101 GET_PROPS(c, props); 102 return GET_CATEGORY(props)==U_UPPERCASE_LETTER; 103 } 104 105 /* Checks if ch is a title case letter; usually upper case letters.*/ 106 U_CAPI UBool U_EXPORT2 107 u_istitle(UChar32 c) { 108 uint32_t props; 109 GET_PROPS(c, props); 110 return GET_CATEGORY(props)==U_TITLECASE_LETTER; 111 } 112 113 /* Checks if ch is a decimal digit. */ 114 U_CAPI UBool U_EXPORT2 115 u_isdigit(UChar32 c) { 116 uint32_t props; 117 GET_PROPS(c, props); 118 return GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER; 119 } 120 121 U_CAPI UBool U_EXPORT2 122 u_isxdigit(UChar32 c) { 123 uint32_t props; 124 125 /* check ASCII and Fullwidth ASCII a-fA-F */ 126 if( 127 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 128 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 129 ) { 130 return true; 131 } 132 133 GET_PROPS(c, props); 134 return GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER; 135 } 136 137 /* Checks if the Unicode character is a letter.*/ 138 U_CAPI UBool U_EXPORT2 139 u_isalpha(UChar32 c) { 140 uint32_t props; 141 GET_PROPS(c, props); 142 return (CAT_MASK(props)&U_GC_L_MASK)!=0; 143 } 144 145 U_CAPI UBool U_EXPORT2 146 u_isUAlphabetic(UChar32 c) { 147 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0; 148 } 149 150 /* Checks if c is a letter or a decimal digit */ 151 U_CAPI UBool U_EXPORT2 152 u_isalnum(UChar32 c) { 153 uint32_t props; 154 GET_PROPS(c, props); 155 return (CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0; 156 } 157 158 /** 159 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. 160 * @internal 161 */ 162 U_CFUNC UBool 163 u_isalnumPOSIX(UChar32 c) { 164 return u_isUAlphabetic(c) || u_isdigit(c); 165 } 166 167 /* Checks if ch is a unicode character with assigned character type.*/ 168 U_CAPI UBool U_EXPORT2 169 u_isdefined(UChar32 c) { 170 uint32_t props; 171 GET_PROPS(c, props); 172 return GET_CATEGORY(props)!=0; 173 } 174 175 /* Checks if the Unicode character is a base form character that can take a diacritic.*/ 176 U_CAPI UBool U_EXPORT2 177 u_isbase(UChar32 c) { 178 uint32_t props; 179 GET_PROPS(c, props); 180 return (CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0; 181 } 182 183 /* Checks if the Unicode character is a control character.*/ 184 U_CAPI UBool U_EXPORT2 185 u_iscntrl(UChar32 c) { 186 uint32_t props; 187 GET_PROPS(c, props); 188 return (CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0; 189 } 190 191 U_CAPI UBool U_EXPORT2 192 u_isISOControl(UChar32 c) { 193 return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f); 194 } 195 196 /* Some control characters that are used as space. */ 197 #define IS_THAT_CONTROL_SPACE(c) \ 198 (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==0x85)) 199 200 /* Java has decided that U+0085 New Line is not whitespace any more. */ 201 #define IS_THAT_ASCII_CONTROL_SPACE(c) \ 202 (c<=0x1f && c>=TAB && (c<=CR || c>=0x1c)) 203 204 /* Checks if the Unicode character is a space character.*/ 205 U_CAPI UBool U_EXPORT2 206 u_isspace(UChar32 c) { 207 uint32_t props; 208 GET_PROPS(c, props); 209 return (CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c); 210 } 211 212 U_CAPI UBool U_EXPORT2 213 u_isJavaSpaceChar(UChar32 c) { 214 uint32_t props; 215 GET_PROPS(c, props); 216 return (CAT_MASK(props)&U_GC_Z_MASK)!=0; 217 } 218 219 /* Checks if the Unicode character is a whitespace character.*/ 220 U_CAPI UBool U_EXPORT2 221 u_isWhitespace(UChar32 c) { 222 uint32_t props; 223 GET_PROPS(c, props); 224 return ((CAT_MASK(props)&U_GC_Z_MASK)!=0 && 225 c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */ 226 IS_THAT_ASCII_CONTROL_SPACE(c); 227 } 228 229 U_CAPI UBool U_EXPORT2 230 u_isblank(UChar32 c) { 231 if((uint32_t)c<=0x9f) { 232 return c==9 || c==0x20; /* TAB or SPACE */ 233 } else { 234 /* Zs */ 235 uint32_t props; 236 GET_PROPS(c, props); 237 return GET_CATEGORY(props)==U_SPACE_SEPARATOR; 238 } 239 } 240 241 U_CAPI UBool U_EXPORT2 242 u_isUWhiteSpace(UChar32 c) { 243 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0; 244 } 245 246 /* Checks if the Unicode character is printable.*/ 247 U_CAPI UBool U_EXPORT2 248 u_isprint(UChar32 c) { 249 uint32_t props; 250 GET_PROPS(c, props); 251 /* comparing ==0 returns false for the categories mentioned */ 252 return (CAT_MASK(props)&U_GC_C_MASK)==0; 253 } 254 255 /** 256 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 257 * Implements UCHAR_POSIX_PRINT. 258 * @internal 259 */ 260 U_CFUNC UBool 261 u_isprintPOSIX(UChar32 c) { 262 uint32_t props; 263 GET_PROPS(c, props); 264 /* 265 * The only cntrl character in graph+blank is TAB (in blank). 266 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 267 */ 268 return (GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c); 269 } 270 271 U_CAPI UBool U_EXPORT2 272 u_isgraph(UChar32 c) { 273 uint32_t props; 274 GET_PROPS(c, props); 275 /* comparing ==0 returns false for the categories mentioned */ 276 return (CAT_MASK(props)& 277 (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 278 ==0; 279 } 280 281 /** 282 * Checks if c is in 283 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 284 * with space=\p{Whitespace} and Control=Cc. 285 * Implements UCHAR_POSIX_GRAPH. 286 * @internal 287 */ 288 U_CFUNC UBool 289 u_isgraphPOSIX(UChar32 c) { 290 uint32_t props; 291 GET_PROPS(c, props); 292 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 293 /* comparing ==0 returns false for the categories mentioned */ 294 return (CAT_MASK(props)& 295 (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 296 ==0; 297 } 298 299 U_CAPI UBool U_EXPORT2 300 u_ispunct(UChar32 c) { 301 uint32_t props; 302 GET_PROPS(c, props); 303 return (CAT_MASK(props)&U_GC_P_MASK)!=0; 304 } 305 306 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/ 307 U_CAPI UBool U_EXPORT2 308 u_isIDIgnorable(UChar32 c) { 309 if(c<=0x9f) { 310 return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c); 311 } else { 312 uint32_t props; 313 GET_PROPS(c, props); 314 return GET_CATEGORY(props)==U_FORMAT_CHAR; 315 } 316 } 317 318 /*Checks if the Unicode character can start a Java identifier.*/ 319 U_CAPI UBool U_EXPORT2 320 u_isJavaIDStart(UChar32 c) { 321 uint32_t props; 322 GET_PROPS(c, props); 323 return (CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0; 324 } 325 326 /*Checks if the Unicode character can be a Java identifier part other than starting the 327 * identifier. 328 */ 329 U_CAPI UBool U_EXPORT2 330 u_isJavaIDPart(UChar32 c) { 331 uint32_t props; 332 GET_PROPS(c, props); 333 return (CAT_MASK(props)& 334 (U_GC_ND_MASK|U_GC_NL_MASK| 335 U_GC_L_MASK| 336 U_GC_SC_MASK|U_GC_PC_MASK| 337 U_GC_MC_MASK|U_GC_MN_MASK) 338 )!=0 || 339 u_isIDIgnorable(c); 340 } 341 342 U_CAPI int32_t U_EXPORT2 343 u_charDigitValue(UChar32 c) { 344 uint32_t props; 345 int32_t value; 346 GET_PROPS(c, props); 347 value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START; 348 if(value<=9) { 349 return value; 350 } else { 351 return -1; 352 } 353 } 354 355 U_CAPI double U_EXPORT2 356 u_getNumericValue(UChar32 c) { 357 uint32_t props; 358 int32_t ntv; 359 GET_PROPS(c, props); 360 ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props); 361 362 if(ntv==UPROPS_NTV_NONE) { 363 return U_NO_NUMERIC_VALUE; 364 } else if(ntv<UPROPS_NTV_DIGIT_START) { 365 /* decimal digit */ 366 return ntv-UPROPS_NTV_DECIMAL_START; 367 } else if(ntv<UPROPS_NTV_NUMERIC_START) { 368 /* other digit */ 369 return ntv-UPROPS_NTV_DIGIT_START; 370 } else if(ntv<UPROPS_NTV_FRACTION_START) { 371 /* small integer */ 372 return ntv-UPROPS_NTV_NUMERIC_START; 373 } else if(ntv<UPROPS_NTV_LARGE_START) { 374 /* fraction */ 375 int32_t numerator=(ntv>>4)-12; 376 int32_t denominator=(ntv&0xf)+1; 377 return (double)numerator/denominator; 378 } else if(ntv<UPROPS_NTV_BASE60_START) { 379 /* large, single-significant-digit integer */ 380 double numValue; 381 int32_t mant=(ntv>>5)-14; 382 int32_t exp=(ntv&0x1f)+2; 383 numValue=mant; 384 385 /* multiply by 10^exp without math.h */ 386 while(exp>=4) { 387 numValue*=10000.; 388 exp-=4; 389 } 390 switch(exp) { 391 case 3: 392 numValue*=1000.; 393 break; 394 case 2: 395 numValue*=100.; 396 break; 397 case 1: 398 numValue*=10.; 399 break; 400 case 0: 401 default: 402 break; 403 } 404 405 return numValue; 406 } else if(ntv<UPROPS_NTV_FRACTION20_START) { 407 /* sexagesimal (base 60) integer */ 408 int32_t numValue=(ntv>>2)-0xbf; 409 int32_t exp=(ntv&3)+1; 410 411 switch(exp) { 412 case 4: 413 numValue*=60*60*60*60; 414 break; 415 case 3: 416 numValue*=60*60*60; 417 break; 418 case 2: 419 numValue*=60*60; 420 break; 421 case 1: 422 numValue*=60; 423 break; 424 case 0: 425 default: 426 break; 427 } 428 429 return numValue; 430 } else if(ntv<UPROPS_NTV_FRACTION32_START) { 431 // fraction-20 e.g. 3/80 432 int32_t frac20=ntv-UPROPS_NTV_FRACTION20_START; // 0..0x17 433 int32_t numerator=2*(frac20&3)+1; 434 int32_t denominator=20<<(frac20>>2); 435 return (double)numerator/denominator; 436 } else if(ntv<UPROPS_NTV_RESERVED_START) { 437 // fraction-32 e.g. 3/64 438 int32_t frac32=ntv-UPROPS_NTV_FRACTION32_START; // 0..15 439 int32_t numerator=2*(frac32&3)+1; 440 int32_t denominator=32<<(frac32>>2); 441 return (double)numerator/denominator; 442 } else { 443 /* reserved */ 444 return U_NO_NUMERIC_VALUE; 445 } 446 } 447 448 U_CAPI int32_t U_EXPORT2 449 u_digit(UChar32 ch, int8_t radix) { 450 int8_t value; 451 if((uint8_t)(radix-2)<=(36-2)) { 452 value=(int8_t)u_charDigitValue(ch); 453 if(value<0) { 454 /* ch is not a decimal digit, try latin letters */ 455 if(ch>=0x61 && ch<=0x7A) { 456 value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */ 457 } else if(ch>=0x41 && ch<=0x5A) { 458 value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */ 459 } else if(ch>=0xFF41 && ch<=0xFF5A) { 460 value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */ 461 } else if(ch>=0xFF21 && ch<=0xFF3A) { 462 value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */ 463 } 464 } 465 } else { 466 value=-1; /* invalid radix */ 467 } 468 return (int8_t)((value<radix) ? value : -1); 469 } 470 471 U_CAPI UChar32 U_EXPORT2 472 u_forDigit(int32_t digit, int8_t radix) { 473 if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) { 474 return 0; 475 } else if(digit<10) { 476 return (UChar32)(0x30+digit); 477 } else { 478 return (UChar32)((0x61-10)+digit); 479 } 480 } 481 482 /* miscellaneous, and support for uprops.cpp -------------------------------- */ 483 484 U_CAPI void U_EXPORT2 485 u_getUnicodeVersion(UVersionInfo versionArray) { 486 if(versionArray!=nullptr) { 487 uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH); 488 } 489 } 490 491 U_CFUNC uint32_t 492 u_getMainProperties(UChar32 c) { 493 uint32_t props; 494 GET_PROPS(c, props); 495 return props; 496 } 497 498 U_CFUNC uint32_t 499 u_getUnicodeProperties(UChar32 c, int32_t column) { 500 U_ASSERT(column>=0); 501 if(column>=propsVectorsColumns) { 502 return 0; 503 } else { 504 uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c); 505 return propsVectors[vecIndex+column]; 506 } 507 } 508 509 U_CFUNC int32_t 510 uprv_getMaxValues(int32_t column) { 511 switch(column) { 512 case 0: 513 return indexes[UPROPS_MAX_VALUES_INDEX]; 514 case 2: 515 return indexes[UPROPS_MAX_VALUES_2_INDEX]; 516 case UPROPS_MAX_VALUES_OTHER_INDEX: 517 return indexes[column]; 518 default: 519 return 0; 520 } 521 } 522 523 U_CAPI void U_EXPORT2 524 u_charAge(UChar32 c, UVersionInfo versionArray) { 525 if(versionArray!=nullptr) { 526 uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT; 527 versionArray[0]=(uint8_t)(version>>2); 528 versionArray[1]=(uint8_t)(version&3); 529 versionArray[2]=versionArray[3]=0; 530 } 531 } 532 533 U_CAPI UScriptCode U_EXPORT2 534 uscript_getScript(UChar32 c, UErrorCode *pErrorCode) { 535 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 536 return USCRIPT_INVALID_CODE; 537 } 538 if((uint32_t)c>0x10ffff) { 539 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 540 return USCRIPT_INVALID_CODE; 541 } 542 uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK; 543 uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT; 544 if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) { 545 return (UScriptCode)codeOrIndex; 546 } else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) { 547 return USCRIPT_COMMON; 548 } else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) { 549 return USCRIPT_INHERITED; 550 } else { 551 return (UScriptCode)scriptExtensions[codeOrIndex]; 552 } 553 } 554 555 U_CAPI UBool U_EXPORT2 556 uscript_hasScript(UChar32 c, UScriptCode sc) UPRV_NO_SANITIZE_UNDEFINED { 557 uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK; 558 uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT; 559 if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) { 560 return sc==(UScriptCode)codeOrIndex; 561 } 562 563 const uint16_t *scx=scriptExtensions+codeOrIndex; 564 if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) { 565 scx=scriptExtensions+scx[1]; 566 } 567 uint32_t sc32=sc; 568 if(sc32>0x7fff) { 569 /* Guard against bogus input that would make us go past the Script_Extensions terminator. */ 570 return false; 571 } 572 while(sc32>*scx) { 573 ++scx; 574 } 575 return sc32==(*scx&0x7fff); 576 } 577 578 U_CAPI int32_t U_EXPORT2 579 uscript_getScriptExtensions(UChar32 c, 580 UScriptCode *scripts, int32_t capacity, 581 UErrorCode *pErrorCode) { 582 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 583 return 0; 584 } 585 if(capacity<0 || (capacity>0 && scripts==nullptr)) { 586 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 587 return 0; 588 } 589 uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK; 590 uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT; 591 if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) { 592 if(capacity==0) { 593 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 594 } else { 595 scripts[0]=(UScriptCode)codeOrIndex; 596 } 597 return 1; 598 } 599 600 const uint16_t *scx=scriptExtensions+codeOrIndex; 601 if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) { 602 scx=scriptExtensions+scx[1]; 603 } 604 int32_t length=0; 605 uint16_t sx; 606 do { 607 sx=*scx++; 608 if(length<capacity) { 609 scripts[length]=(UScriptCode)(sx&0x7fff); 610 } 611 ++length; 612 } while(sx<0x8000); 613 if(length>capacity) { 614 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 615 } 616 return length; 617 } 618 619 namespace { 620 621 UBool U_CALLCONV 622 _scxRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 623 // From u_getUnicodeProperties(start, 0). 624 uint32_t vecWord = propsVectors[value]; // vecIndex=value, column 0 625 uint32_t scriptX = vecWord & UPROPS_SCRIPT_X_MASK; 626 if (scriptX >= UPROPS_SCRIPT_X_WITH_COMMON) { 627 // Code points start..end have Script_Extensions. 628 const USetAdder* sa = static_cast<const USetAdder*>(context); 629 sa->addRange(sa->set, start, end); 630 } 631 (void) value; 632 return true; 633 } 634 635 } 636 637 // for icuexportdata 638 U_CAPI void U_EXPORT2 639 uprv_addScriptExtensionsCodePoints(const USetAdder *sa, UErrorCode *pErrorCode) { 640 if(U_FAILURE(*pErrorCode)) { 641 return; 642 } 643 utrie2_enum(&propsVectorsTrie, nullptr, _scxRange, sa); 644 } 645 646 U_CAPI UBlockCode U_EXPORT2 647 ublock_getCode(UChar32 c) { 648 // We store Block values indexed by the code point shifted right 4 bits 649 // and use a "small" UCPTrie=CodePointTrie for minimal data size. 650 // This works because blocks have xxx0..xxxF ranges. 651 uint32_t c4 = c; // unsigned so that shifting right does not worry the compiler 652 // Shift unless out of range, in which case we fetch the trie's error value. 653 if (c4 <= 0x10ffff) { 654 c4 >>= 4; 655 } 656 return (UBlockCode)ucptrie_get(&block_trie, c4); 657 } 658 659 /* property starts for UnicodeSet ------------------------------------------- */ 660 661 static UBool U_CALLCONV 662 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 663 /* add the start code point to the USet */ 664 const USetAdder* sa = static_cast<const USetAdder*>(context); 665 sa->add(sa->set, start); 666 (void)end; 667 (void)value; 668 return true; 669 } 670 671 #define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1) 672 673 U_CFUNC void U_EXPORT2 674 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 675 if(U_FAILURE(*pErrorCode)) { 676 return; 677 } 678 679 /* add the start code point of each same-value range of the main trie */ 680 utrie2_enum(&propsTrie, nullptr, _enumPropertyStartsRange, sa); 681 682 /* add code points with hardcoded properties, plus the ones following them */ 683 684 /* add for u_isblank() */ 685 USET_ADD_CP_AND_NEXT(sa, TAB); 686 687 /* add for IS_THAT_CONTROL_SPACE() */ 688 sa->add(sa->set, CR+1); /* range TAB..CR */ 689 sa->add(sa->set, 0x1c); 690 sa->add(sa->set, 0x1f+1); 691 USET_ADD_CP_AND_NEXT(sa, 0x85); // NEXT LINE (NEL) 692 693 /* add for u_isIDIgnorable() what was not added above */ 694 sa->add(sa->set, 0x7f); /* range DEL..NBSP-1, NBSP added below */ 695 sa->add(sa->set, HAIRSP); 696 sa->add(sa->set, RLM+1); 697 sa->add(sa->set, 0x206a); // INHIBIT SYMMETRIC SWAPPING 698 sa->add(sa->set, 0x206f+1); // NOMINAL DIGIT SHAPES 699 USET_ADD_CP_AND_NEXT(sa, ZWNBSP); 700 701 /* add no-break spaces for u_isWhitespace() what was not added above */ 702 USET_ADD_CP_AND_NEXT(sa, NBSP); 703 USET_ADD_CP_AND_NEXT(sa, FIGURESP); 704 USET_ADD_CP_AND_NEXT(sa, NNBSP); 705 706 /* add for u_digit() */ 707 sa->add(sa->set, u'a'); 708 sa->add(sa->set, u'z'+1); 709 sa->add(sa->set, u'A'); 710 sa->add(sa->set, u'Z'+1); 711 // fullwidth 712 sa->add(sa->set, u'a'); 713 sa->add(sa->set, u'z'+1); 714 sa->add(sa->set, u'A'); 715 sa->add(sa->set, u'Z'+1); 716 717 /* add for u_isxdigit() */ 718 sa->add(sa->set, u'f'+1); 719 sa->add(sa->set, u'F'+1); 720 // fullwidth 721 sa->add(sa->set, u'f'+1); 722 sa->add(sa->set, u'F'+1); 723 724 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 725 sa->add(sa->set, 0x2060); /* range 2060..206f */ 726 sa->add(sa->set, 0xfff0); 727 sa->add(sa->set, 0xfffb+1); 728 sa->add(sa->set, 0xe0000); 729 sa->add(sa->set, 0xe0fff+1); 730 731 /* add for UCHAR_GRAPHEME_BASE and others */ 732 USET_ADD_CP_AND_NEXT(sa, CGJ); 733 } 734 735 U_CFUNC void U_EXPORT2 736 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 737 if(U_FAILURE(*pErrorCode)) { 738 return; 739 } 740 741 /* add the start code point of each same-value range of the properties vectors trie */ 742 utrie2_enum(&propsVectorsTrie, nullptr, _enumPropertyStartsRange, sa); 743 } 744 745 U_CFUNC void U_EXPORT2 746 ublock_addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) { 747 // Add the start code point of each same-value range of the trie. 748 // We store Block values indexed by the code point shifted right 4 bits; 749 // see ublock_getCode(). 750 UChar32 start = 0, end; 751 uint32_t value; 752 while (start < 0x11000 && // limit: (max code point + 1) >> 4 753 (end = ucptrie_getRange(&block_trie, start, UCPMAP_RANGE_NORMAL, 0, 754 nullptr, nullptr, &value)) >= 0) { 755 sa->add(sa->set, start << 4); 756 start = end + 1; 757 } 758 }