uprops.cpp (40492B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uprops.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002feb24 16 * created by: Markus W. Scherer 17 * 18 * Implementations for mostly non-core Unicode character properties 19 * stored in uprops.icu. 20 * 21 * With the APIs implemented here, almost all properties files and 22 * their associated implementation files are used from this file, 23 * including those for normalization and case mappings. 24 */ 25 26 #include "unicode/utypes.h" 27 #include "unicode/uchar.h" 28 #include "unicode/ucptrie.h" 29 #include "unicode/udata.h" 30 #include "unicode/unorm2.h" 31 #include "unicode/uscript.h" 32 #include "unicode/ustring.h" 33 #include "unicode/utf16.h" 34 #include "cstring.h" 35 #include "emojiprops.h" 36 #include "mutex.h" 37 #include "normalizer2impl.h" 38 #include "umutex.h" 39 #include "ubidi_props.h" 40 #include "uprops.h" 41 #include "ucase.h" 42 #include "ucln_cmn.h" 43 #include "ulayout_props.h" 44 #include "ustr_imp.h" 45 46 U_NAMESPACE_USE 47 48 // Unicode text layout properties data ----------------------------------------- 49 50 namespace { 51 52 icu::UInitOnce gLayoutInitOnce {}; 53 UDataMemory *gLayoutMemory = nullptr; 54 55 UCPTrie *gInpcTrie = nullptr; // Indic_Positional_Category 56 UCPTrie *gInscTrie = nullptr; // Indic_Syllabic_Category 57 UCPTrie *gVoTrie = nullptr; // Vertical_Orientation 58 59 int32_t gMaxInpcValue = 0; 60 int32_t gMaxInscValue = 0; 61 int32_t gMaxVoValue = 0; 62 63 UBool U_CALLCONV uprops_cleanup() { 64 udata_close(gLayoutMemory); 65 gLayoutMemory = nullptr; 66 67 ucptrie_close(gInpcTrie); 68 gInpcTrie = nullptr; 69 ucptrie_close(gInscTrie); 70 gInscTrie = nullptr; 71 ucptrie_close(gVoTrie); 72 gVoTrie = nullptr; 73 74 gMaxInpcValue = 0; 75 gMaxInscValue = 0; 76 gMaxVoValue = 0; 77 78 gLayoutInitOnce.reset(); 79 return true; 80 } 81 82 UBool U_CALLCONV 83 ulayout_isAcceptable(void * /*context*/, 84 const char * /* type */, const char * /*name*/, 85 const UDataInfo *pInfo) { 86 return pInfo->size >= 20 && 87 pInfo->isBigEndian == U_IS_BIG_ENDIAN && 88 pInfo->charsetFamily == U_CHARSET_FAMILY && 89 pInfo->dataFormat[0] == ULAYOUT_FMT_0 && 90 pInfo->dataFormat[1] == ULAYOUT_FMT_1 && 91 pInfo->dataFormat[2] == ULAYOUT_FMT_2 && 92 pInfo->dataFormat[3] == ULAYOUT_FMT_3 && 93 pInfo->formatVersion[0] == 1; 94 } 95 96 // UInitOnce singleton initialization function 97 void U_CALLCONV ulayout_load(UErrorCode &errorCode) { 98 gLayoutMemory = udata_openChoice( 99 nullptr, ULAYOUT_DATA_TYPE, ULAYOUT_DATA_NAME, 100 ulayout_isAcceptable, nullptr, &errorCode); 101 if (U_FAILURE(errorCode)) { return; } 102 103 const uint8_t* inBytes = static_cast<const uint8_t*>(udata_getMemory(gLayoutMemory)); 104 const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes); 105 int32_t indexesLength = inIndexes[ULAYOUT_IX_INDEXES_LENGTH]; 106 if (indexesLength < 12) { 107 errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. 108 return; 109 } 110 int32_t offset = indexesLength * 4; 111 int32_t top = inIndexes[ULAYOUT_IX_INPC_TRIE_TOP]; 112 int32_t trieSize = top - offset; 113 if (trieSize >= 16) { 114 gInpcTrie = ucptrie_openFromBinary( 115 UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, 116 inBytes + offset, trieSize, nullptr, &errorCode); 117 } 118 offset = top; 119 top = inIndexes[ULAYOUT_IX_INSC_TRIE_TOP]; 120 trieSize = top - offset; 121 if (trieSize >= 16) { 122 gInscTrie = ucptrie_openFromBinary( 123 UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, 124 inBytes + offset, trieSize, nullptr, &errorCode); 125 } 126 offset = top; 127 top = inIndexes[ULAYOUT_IX_VO_TRIE_TOP]; 128 trieSize = top - offset; 129 if (trieSize >= 16) { 130 gVoTrie = ucptrie_openFromBinary( 131 UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, 132 inBytes + offset, trieSize, nullptr, &errorCode); 133 } 134 135 uint32_t maxValues = inIndexes[ULAYOUT_IX_MAX_VALUES]; 136 gMaxInpcValue = maxValues >> ULAYOUT_MAX_INPC_SHIFT; 137 gMaxInscValue = (maxValues >> ULAYOUT_MAX_INSC_SHIFT) & 0xff; 138 gMaxVoValue = (maxValues >> ULAYOUT_MAX_VO_SHIFT) & 0xff; 139 140 ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup); 141 } 142 143 UBool ulayout_ensureData(UErrorCode &errorCode) { 144 if (U_FAILURE(errorCode)) { return false; } 145 umtx_initOnce(gLayoutInitOnce, &ulayout_load, errorCode); 146 return U_SUCCESS(errorCode); 147 } 148 149 UBool ulayout_ensureData() { 150 UErrorCode errorCode = U_ZERO_ERROR; 151 return ulayout_ensureData(errorCode); 152 } 153 154 } // namespace 155 156 /* general properties API functions ----------------------------------------- */ 157 158 struct BinaryProperty; 159 160 typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which); 161 162 struct BinaryProperty { 163 int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 164 uint32_t mask; 165 BinaryPropertyContains *contains; 166 }; 167 168 static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { 169 /* systematic, directly stored properties */ 170 return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; 171 } 172 173 static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { 174 return ucase_hasBinaryProperty(c, which); 175 } 176 177 static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 178 return ubidi_isBidiControl(c); 179 } 180 181 static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 182 return ubidi_isMirrored(c); 183 } 184 185 static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 186 return ubidi_isJoinControl(c); 187 } 188 189 #if UCONFIG_NO_NORMALIZATION 190 static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { 191 return false; 192 } 193 #else 194 static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 195 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. 196 UErrorCode errorCode=U_ZERO_ERROR; 197 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); 198 return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); 199 } 200 #endif 201 202 // UCHAR_NF*_INERT properties 203 #if UCONFIG_NO_NORMALIZATION 204 static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { 205 return false; 206 } 207 #else 208 static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { 209 UErrorCode errorCode=U_ZERO_ERROR; 210 const Normalizer2 *norm2=Normalizer2Factory::getInstance( 211 static_cast<UNormalizationMode>(which - UCHAR_NFD_INERT + UNORM_NFD), errorCode); 212 return U_SUCCESS(errorCode) && norm2->isInert(c); 213 } 214 #endif 215 216 #if UCONFIG_NO_NORMALIZATION 217 static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) { 218 return false; 219 } 220 #else 221 static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 222 UnicodeString nfd; 223 UErrorCode errorCode=U_ZERO_ERROR; 224 const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); 225 if(U_FAILURE(errorCode)) { 226 return false; 227 } 228 if(nfcNorm2->getDecomposition(c, nfd)) { 229 /* c has a decomposition */ 230 if(nfd.length()==1) { 231 c=nfd[0]; /* single BMP code point */ 232 } else if(nfd.length()<=U16_MAX_LENGTH && 233 nfd.length()==U16_LENGTH(c=nfd.char32At(0)) 234 ) { 235 /* single supplementary code point */ 236 } else { 237 c=U_SENTINEL; 238 } 239 } else if(c<0) { 240 return false; /* protect against bad input */ 241 } 242 if(c>=0) { 243 /* single code point */ 244 const char16_t *resultString; 245 return ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT) >= 0; 246 } else { 247 /* guess some large but stack-friendly capacity */ 248 char16_t dest[2*UCASE_MAX_STRING_LENGTH]; 249 int32_t destLength; 250 destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest), 251 nfd.getBuffer(), nfd.length(), 252 U_FOLD_CASE_DEFAULT, &errorCode); 253 return U_SUCCESS(errorCode) && 254 0!=u_strCompare(nfd.getBuffer(), nfd.length(), 255 dest, destLength, false); 256 } 257 } 258 #endif 259 260 #if UCONFIG_NO_NORMALIZATION 261 static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) { 262 return false; 263 } 264 #else 265 static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 266 UErrorCode errorCode=U_ZERO_ERROR; 267 const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); 268 if(U_FAILURE(errorCode)) { 269 return false; 270 } 271 UnicodeString src(c); 272 UnicodeString dest; 273 { 274 // The ReorderingBuffer must be in a block because its destructor 275 // needs to release dest's buffer before we look at its contents. 276 ReorderingBuffer buffer(*kcf, dest); 277 // Small destCapacity for NFKC_CF(c). 278 if(buffer.init(5, errorCode)) { 279 const char16_t *srcArray=src.getBuffer(); 280 kcf->compose(srcArray, srcArray+src.length(), false, 281 true, buffer, errorCode); 282 } 283 } 284 return U_SUCCESS(errorCode) && dest!=src; 285 } 286 #endif 287 288 #if UCONFIG_NO_NORMALIZATION 289 static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { 290 return false; 291 } 292 #else 293 static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 294 UErrorCode errorCode=U_ZERO_ERROR; 295 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); 296 return 297 U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && 298 impl->isCanonSegmentStarter(c); 299 } 300 #endif 301 302 static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 303 return u_isalnumPOSIX(c); 304 } 305 306 static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 307 return u_isblank(c); 308 } 309 310 static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 311 return u_isgraphPOSIX(c); 312 } 313 314 static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 315 return u_isprintPOSIX(c); 316 } 317 318 static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 319 return u_isxdigit(c); 320 } 321 322 static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 323 // Property starts are a subset of lb=RI etc. 324 return 0x1F1E6<=c && c<=0x1F1FF; 325 } 326 327 static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { 328 return EmojiProps::hasBinaryProperty(c, which); 329 } 330 331 static UBool isIDSUnaryOperator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 332 // New in Unicode 15.1 for just two characters. 333 return 0x2FFE<=c && c<=0x2FFF; 334 } 335 336 /** Ranges (start/limit pairs) of ID_Compat_Math_Continue (only), from UCD PropList.txt. */ 337 static constexpr UChar32 ID_COMPAT_MATH_CONTINUE[] = { 338 0x00B2, 0x00B3 + 1, 339 0x00B9, 0x00B9 + 1, 340 0x2070, 0x2070 + 1, 341 0x2074, 0x207E + 1, 342 0x2080, 0x208E + 1 343 }; 344 345 /** ID_Compat_Math_Start characters, from UCD PropList.txt. */ 346 static constexpr UChar32 ID_COMPAT_MATH_START[] = { 347 0x2202, 348 0x2207, 349 0x221E, 350 0x1D6C1, 351 0x1D6DB, 352 0x1D6FB, 353 0x1D715, 354 0x1D735, 355 0x1D74F, 356 0x1D76F, 357 0x1D789, 358 0x1D7A9, 359 0x1D7C3 360 }; 361 362 /** Ranges (start/limit pairs) of Modifier_Combining_mark (only), from UCD PropList.txt. */ 363 static constexpr UChar32 MODIFIER_COMBINING_MARK[] = { 364 0x0654, 0x0655 + 1, 365 0x0658, 0x0658 + 1, // U+0658 366 0x06DC, 0x06DC + 1, // U+06DC 367 0x06E3, 0x06E3 + 1, // U+06E3 368 0x06E7, 0x06E8 + 1, 369 0x08CA, 0x08CB + 1, 370 0x08CD, 0x08CF + 1, 371 0x08D3, 0x08D3 + 1, // U+08D3 372 0x08F3, 0x08F3 + 1 // U+08F3 373 }; 374 375 static UBool isIDCompatMathStart(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 376 if (c < ID_COMPAT_MATH_START[0]) { return false; } // fastpath for common scripts 377 for (UChar32 startChar : ID_COMPAT_MATH_START) { 378 if (c == startChar) { return true; } 379 } 380 return false; 381 } 382 383 static UBool isIDCompatMathContinue(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { 384 for (int32_t i = 0; i < UPRV_LENGTHOF(ID_COMPAT_MATH_CONTINUE); i += 2) { 385 if (c < ID_COMPAT_MATH_CONTINUE[i]) { return false; } // below range start 386 if (c < ID_COMPAT_MATH_CONTINUE[i + 1]) { return true; } // below range limit 387 } 388 return isIDCompatMathStart(prop, c, UCHAR_ID_COMPAT_MATH_START); 389 } 390 391 static UBool isModifierCombiningMark(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 392 for (int32_t i = 0; i < UPRV_LENGTHOF(MODIFIER_COMBINING_MARK); i += 2) { 393 if (c < MODIFIER_COMBINING_MARK[i]) { return false; } // below range start 394 if (c < MODIFIER_COMBINING_MARK[i + 1]) { return true; } // below range limit 395 } 396 return false; 397 } 398 399 static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={ 400 /* 401 * column and mask values for binary properties from u_getUnicodeProperties(). 402 * Must be in order of corresponding UProperty, 403 * and there must be exactly one entry per binary UProperty. 404 * 405 * Properties with mask==0 are handled in code. 406 * For them, column is the UPropertySource value. 407 */ 408 { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains }, 409 { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains }, 410 { UPROPS_SRC_BIDI, 0, isBidiControl }, 411 { UPROPS_SRC_BIDI, 0, isMirrored }, 412 { 1, U_MASK(UPROPS_DASH), defaultContains }, 413 { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains }, 414 { 1, U_MASK(UPROPS_DEPRECATED), defaultContains }, 415 { 1, U_MASK(UPROPS_DIACRITIC), defaultContains }, 416 { 1, U_MASK(UPROPS_EXTENDER), defaultContains }, 417 { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion }, 418 { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains }, 419 { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains }, 420 { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains }, 421 { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains }, 422 { 1, U_MASK(UPROPS_HYPHEN), defaultContains }, 423 { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains }, 424 { 1, U_MASK(UPROPS_ID_START), defaultContains }, 425 { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains }, 426 { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains }, 427 { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains }, 428 { UPROPS_SRC_BIDI, 0, isJoinControl }, 429 { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains }, 430 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE 431 { 1, U_MASK(UPROPS_MATH), defaultContains }, 432 { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains }, 433 { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains }, 434 { 1, U_MASK(UPROPS_RADICAL), defaultContains }, 435 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED 436 { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains }, 437 { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains }, 438 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE 439 { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains }, 440 { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains }, 441 { 1, U_MASK(UPROPS_XID_START), defaultContains }, 442 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE 443 { 1, U_MASK(UPROPS_S_TERM), defaultContains }, 444 { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains }, 445 { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT 446 { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT 447 { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT 448 { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT 449 { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter }, 450 { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains }, 451 { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains }, 452 { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum }, 453 { UPROPS_SRC_CHAR, 0, isPOSIX_blank }, 454 { UPROPS_SRC_CHAR, 0, isPOSIX_graph }, 455 { UPROPS_SRC_CHAR, 0, isPOSIX_print }, 456 { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit }, 457 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED 458 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE 459 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED 460 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED 461 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED 462 { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded }, 463 { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED 464 { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }, 465 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI 466 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_PRESENTATION 467 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER 468 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER_BASE 469 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_COMPONENT 470 { 2, 0, isRegionalIndicator }, 471 { 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains }, 472 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EXTENDED_PICTOGRAPHIC 473 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_BASIC_EMOJI 474 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_KEYCAP_SEQUENCE 475 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE 476 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE 477 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_TAG_SEQUENCE 478 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE 479 { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI 480 { UPROPS_SRC_IDSU, 0, isIDSUnaryOperator }, // UCHAR_IDS_UNARY_OPERATOR 481 { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathStart }, // UCHAR_ID_COMPAT_MATH_START 482 { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathContinue }, // UCHAR_ID_COMPAT_MATH_CONTINUE 483 { UPROPS_SRC_MCM, 0 , isModifierCombiningMark }, // UCHAR_MODIFIER_COMBINING_MARK 484 }; 485 486 U_CAPI UBool U_EXPORT2 487 u_hasBinaryProperty(UChar32 c, UProperty which) { 488 /* c is range-checked in the functions that are called from here */ 489 if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { 490 /* not a known binary property */ 491 return false; 492 } else { 493 const BinaryProperty &prop=binProps[which]; 494 return prop.contains(prop, c, which); 495 } 496 } 497 498 /* Checks if the Unicode character can start a Unicode identifier.*/ 499 U_CAPI UBool U_EXPORT2 500 u_isIDStart(UChar32 c) { 501 return u_hasBinaryProperty(c, UCHAR_ID_START); 502 } 503 504 /* Checks if the Unicode character can be a Unicode identifier part other than starting the 505 identifier.*/ 506 U_CAPI UBool U_EXPORT2 507 u_isIDPart(UChar32 c) { 508 return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE); 509 } 510 511 U_CAPI UBool U_EXPORT2 512 u_stringHasBinaryProperty(const char16_t *s, int32_t length, UProperty which) { 513 if (s == nullptr && length != 0) { return false; } 514 if (length == 1) { 515 return u_hasBinaryProperty(s[0], which); // single code point 516 } else if (length == 2 || (length < 0 && *s != 0)) { // not empty string 517 // first code point 518 int32_t i = 0; 519 UChar32 c; 520 U16_NEXT(s, i, length, c); 521 if (length > 0 ? i == length : s[i] == 0) { 522 return u_hasBinaryProperty(c, which); // single code point 523 } 524 } 525 // Only call into EmojiProps for a relevant property, 526 // so that we not unnecessarily try to load its data file. 527 return UCHAR_BASIC_EMOJI <= which && which <= UCHAR_RGI_EMOJI && 528 EmojiProps::hasBinaryProperty(s, length, which); 529 } 530 531 struct IntProperty; 532 533 typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which); 534 typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which); 535 536 struct IntProperty { 537 int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 538 uint32_t mask; 539 int32_t shift; // =maxValue if getMaxValueFromShift() is used 540 IntPropertyGetValue *getValue; 541 IntPropertyGetMaxValue *getMaxValue; 542 }; 543 544 static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { 545 /* systematic, directly stored properties */ 546 return static_cast<int32_t>(u_getUnicodeProperties(c, prop.column) & prop.mask) >> prop.shift; 547 } 548 549 static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { 550 return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; 551 } 552 553 static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { 554 return prop.shift; 555 } 556 557 static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 558 return static_cast<int32_t>(u_charDirection(c)); 559 } 560 561 static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 562 return static_cast<int32_t>(ubidi_getPairedBracketType(c)); 563 } 564 565 static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { 566 return ubidi_getMaxValue(which); 567 } 568 569 static int32_t getBlock(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 570 return static_cast<int32_t>(ublock_getCode(c)); 571 } 572 573 static int32_t blockGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) { 574 return uprv_getMaxValues(UPROPS_MAX_VALUES_OTHER_INDEX) & UPROPS_MAX_BLOCK; 575 } 576 577 #if UCONFIG_NO_NORMALIZATION 578 static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { 579 return 0; 580 } 581 #else 582 static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 583 return u_getCombiningClass(c); 584 } 585 #endif 586 587 static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 588 return static_cast<int32_t>(u_charType(c)); 589 } 590 591 static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 592 return ubidi_getJoiningGroup(c); 593 } 594 595 static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 596 return ubidi_getJoiningType(c); 597 } 598 599 static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 600 int32_t ntv = static_cast<int32_t>(GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c))); 601 return UPROPS_NTV_GET_TYPE(ntv); 602 } 603 604 static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 605 UErrorCode errorCode=U_ZERO_ERROR; 606 return static_cast<int32_t>(uscript_getScript(c, &errorCode)); 607 } 608 609 static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) { 610 return uprv_getMaxValues(0)&UPROPS_MAX_SCRIPT; 611 } 612 613 /* 614 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 615 * Hangul_Syllable_Type used to be fully redundant with a subset of Grapheme_Cluster_Break. 616 * 617 * Starting with Unicode 16, this is no longer true for HST=V vs. GCB=V in some cases: 618 * Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but 619 * they are of course not related to Hangul syllables. 620 */ 621 static const UHangulSyllableType gcbToHst[]={ 622 U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */ 623 U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */ 624 U_HST_NOT_APPLICABLE, /* U_GCB_CR */ 625 U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */ 626 U_HST_LEADING_JAMO, /* U_GCB_L */ 627 U_HST_NOT_APPLICABLE, /* U_GCB_LF */ 628 U_HST_LV_SYLLABLE, /* U_GCB_LV */ 629 U_HST_LVT_SYLLABLE, /* U_GCB_LVT */ 630 U_HST_TRAILING_JAMO, /* U_GCB_T */ 631 U_HST_VOWEL_JAMO /* U_GCB_V */ 632 /* 633 * Omit GCB values beyond what we need for hst. 634 * The code below checks for the array length. 635 */ 636 }; 637 638 static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 639 // Ignore supplementary code points: They all have HST=NA. 640 // This is a simple way to handle the GCB!=hst cases since Unicode 16 (Kirat Rai vowels). 641 if(c>0xffff) { 642 return U_HST_NOT_APPLICABLE; 643 } 644 /* see comments on gcbToHst[] above */ 645 int32_t gcb = static_cast<int32_t>(u_getUnicodeProperties(c, 2) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT; 646 if(gcb<UPRV_LENGTHOF(gcbToHst)) { 647 return gcbToHst[gcb]; 648 } else { 649 return U_HST_NOT_APPLICABLE; 650 } 651 } 652 653 #if UCONFIG_NO_NORMALIZATION 654 static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { 655 return 0; 656 } 657 #else 658 static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { 659 return static_cast<int32_t>(unorm_getQuickCheck(c, static_cast<UNormalizationMode>(which - UCHAR_NFD_QUICK_CHECK + UNORM_NFD))); 660 } 661 #endif 662 663 #if UCONFIG_NO_NORMALIZATION 664 static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { 665 return 0; 666 } 667 #else 668 static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 669 return unorm_getFCD16(c)>>8; 670 } 671 #endif 672 673 #if UCONFIG_NO_NORMALIZATION 674 static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { 675 return 0; 676 } 677 #else 678 static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { 679 return unorm_getFCD16(c)&0xff; 680 } 681 #endif 682 683 static int32_t getInPC(const IntProperty &, UChar32 c, UProperty) { 684 return ulayout_ensureData() && gInpcTrie != nullptr ? ucptrie_get(gInpcTrie, c) : 0; 685 } 686 687 static int32_t getInSC(const IntProperty &, UChar32 c, UProperty) { 688 return ulayout_ensureData() && gInscTrie != nullptr ? ucptrie_get(gInscTrie, c) : 0; 689 } 690 691 static int32_t getVo(const IntProperty &, UChar32 c, UProperty) { 692 return ulayout_ensureData() && gVoTrie != nullptr ? ucptrie_get(gVoTrie, c) : 0; 693 } 694 695 static int32_t layoutGetMaxValue(const IntProperty &/*prop*/, UProperty which) { 696 if (!ulayout_ensureData()) { return 0; } 697 switch (which) { 698 case UCHAR_INDIC_POSITIONAL_CATEGORY: 699 return gMaxInpcValue; 700 case UCHAR_INDIC_SYLLABIC_CATEGORY: 701 return gMaxInscValue; 702 case UCHAR_VERTICAL_ORIENTATION: 703 return gMaxVoValue; 704 default: 705 return 0; 706 } 707 } 708 709 static int32_t getIDStatusValue(const IntProperty & /*prop*/, UChar32 c, UProperty /*which*/) { 710 uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT; 711 return value >= UPROPS_ID_TYPE_ALLOWED_MIN ? U_ID_STATUS_ALLOWED : U_ID_STATUS_RESTRICTED; 712 } 713 714 static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={ 715 /* 716 * column, mask and shift values for int-value properties from u_getUnicodeProperties(). 717 * Must be in order of corresponding UProperty, 718 * and there must be exactly one entry per int UProperty. 719 * 720 * Properties with mask==0 are handled in code. 721 * For them, column is the UPropertySource value. 722 */ 723 { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue }, 724 { UPROPS_SRC_BLOCK, 0, 0, getBlock, blockGetMaxValue }, 725 { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift }, 726 { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue }, 727 { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue }, 728 { UPROPS_SRC_CHAR, 0, static_cast<int32_t>(U_CHAR_CATEGORY_COUNT) - 1, getGeneralCategory, getMaxValueFromShift }, 729 { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue }, 730 { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue }, 731 { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue }, 732 { UPROPS_SRC_CHAR, 0, static_cast<int32_t>(U_NT_COUNT) - 1, getNumericType, getMaxValueFromShift }, 733 { UPROPS_SRC_PROPSVEC, 0, 0, getScript, scriptGetMaxValue }, 734 { UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_HST_COUNT) - 1, getHangulSyllableType, getMaxValueFromShift }, 735 // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" 736 { UPROPS_SRC_NFC, 0, static_cast<int32_t>(UNORM_YES), getNormQuickCheck, getMaxValueFromShift }, 737 // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" 738 { UPROPS_SRC_NFKC, 0, static_cast<int32_t>(UNORM_YES), getNormQuickCheck, getMaxValueFromShift }, 739 // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE 740 { UPROPS_SRC_NFC, 0, static_cast<int32_t>(UNORM_MAYBE), getNormQuickCheck, getMaxValueFromShift }, 741 // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE 742 { UPROPS_SRC_NFKC, 0, static_cast<int32_t>(UNORM_MAYBE), getNormQuickCheck, getMaxValueFromShift }, 743 { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift }, 744 { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift }, 745 { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue }, 746 { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue }, 747 { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue }, 748 { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue }, 749 { UPROPS_SRC_INPC, 0, 0, getInPC, layoutGetMaxValue }, 750 { UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue }, 751 { UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue }, 752 { UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift }, 753 { 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue }, 754 }; 755 756 U_CAPI int32_t U_EXPORT2 757 u_getIntPropertyValue(UChar32 c, UProperty which) { 758 if(which<UCHAR_INT_START) { 759 if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { 760 const BinaryProperty &prop=binProps[which]; 761 return prop.contains(prop, c, which); 762 } 763 } else if(which<UCHAR_INT_LIMIT) { 764 const IntProperty &prop=intProps[which-UCHAR_INT_START]; 765 return prop.getValue(prop, c, which); 766 } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { 767 return U_MASK(u_charType(c)); 768 } 769 return 0; // undefined 770 } 771 772 U_CAPI int32_t U_EXPORT2 773 u_getIntPropertyMinValue(UProperty /*which*/) { 774 return 0; /* all binary/enum/int properties have a minimum value of 0 */ 775 } 776 777 U_CAPI int32_t U_EXPORT2 778 u_getIntPropertyMaxValue(UProperty which) { 779 if(which<UCHAR_INT_START) { 780 if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { 781 return 1; // maximum true for all binary properties 782 } 783 } else if(which<UCHAR_INT_LIMIT) { 784 const IntProperty &prop=intProps[which-UCHAR_INT_START]; 785 return prop.getMaxValue(prop, which); 786 } 787 return -1; // undefined 788 } 789 790 U_CFUNC UPropertySource U_EXPORT2 791 uprops_getSource(UProperty which) { 792 if(which<UCHAR_BINARY_START) { 793 return UPROPS_SRC_NONE; /* undefined */ 794 } else if(which<UCHAR_BINARY_LIMIT) { 795 const BinaryProperty &prop=binProps[which]; 796 if(prop.mask!=0) { 797 return UPROPS_SRC_PROPSVEC; 798 } else { 799 return (UPropertySource)prop.column; 800 } 801 } else if(which<UCHAR_INT_START) { 802 return UPROPS_SRC_NONE; /* undefined */ 803 } else if(which<UCHAR_INT_LIMIT) { 804 const IntProperty &prop=intProps[which-UCHAR_INT_START]; 805 if(prop.mask!=0) { 806 return UPROPS_SRC_PROPSVEC; 807 } else { 808 return (UPropertySource)prop.column; 809 } 810 } else if(which<UCHAR_STRING_START) { 811 switch(which) { 812 case UCHAR_GENERAL_CATEGORY_MASK: 813 case UCHAR_NUMERIC_VALUE: 814 return UPROPS_SRC_CHAR; 815 816 default: 817 return UPROPS_SRC_NONE; 818 } 819 } else if(which<UCHAR_STRING_LIMIT) { 820 switch(which) { 821 case UCHAR_AGE: 822 return UPROPS_SRC_PROPSVEC; 823 824 case UCHAR_BIDI_MIRRORING_GLYPH: 825 return UPROPS_SRC_BIDI; 826 827 case UCHAR_CASE_FOLDING: 828 case UCHAR_LOWERCASE_MAPPING: 829 case UCHAR_SIMPLE_CASE_FOLDING: 830 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 831 case UCHAR_SIMPLE_TITLECASE_MAPPING: 832 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 833 case UCHAR_TITLECASE_MAPPING: 834 case UCHAR_UPPERCASE_MAPPING: 835 return UPROPS_SRC_CASE; 836 837 case UCHAR_ISO_COMMENT: 838 case UCHAR_NAME: 839 case UCHAR_UNICODE_1_NAME: 840 return UPROPS_SRC_NAMES; 841 842 default: 843 return UPROPS_SRC_NONE; 844 } 845 } else { 846 switch(which) { 847 case UCHAR_SCRIPT_EXTENSIONS: 848 case UCHAR_IDENTIFIER_TYPE: 849 return UPROPS_SRC_PROPSVEC; 850 default: 851 return UPROPS_SRC_NONE; /* undefined */ 852 } 853 } 854 } 855 856 U_CFUNC void U_EXPORT2 857 uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) { 858 if (U_FAILURE(*pErrorCode)) { return; } 859 if (src == UPROPS_SRC_ID_COMPAT_MATH) { 860 // range limits 861 for (UChar32 c : ID_COMPAT_MATH_CONTINUE) { 862 sa->add(sa->set, c); 863 } 864 // single characters 865 for (UChar32 c : ID_COMPAT_MATH_START) { 866 sa->add(sa->set, c); 867 sa->add(sa->set, c + 1); 868 } 869 return; 870 } 871 if (src == UPROPS_SRC_MCM) { 872 // range limits 873 for (UChar32 c : MODIFIER_COMBINING_MARK) { 874 sa->add(sa->set, c); 875 } 876 return; 877 } 878 if (!ulayout_ensureData(*pErrorCode)) { return; } 879 const UCPTrie *trie; 880 switch (src) { 881 case UPROPS_SRC_INPC: 882 trie = gInpcTrie; 883 break; 884 case UPROPS_SRC_INSC: 885 trie = gInscTrie; 886 break; 887 case UPROPS_SRC_VO: 888 trie = gVoTrie; 889 break; 890 default: 891 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 892 return; 893 } 894 895 if (trie == nullptr) { 896 *pErrorCode = U_MISSING_RESOURCE_ERROR; 897 return; 898 } 899 900 // Add the start code point of each same-value range of the trie. 901 UChar32 start = 0, end; 902 while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, 903 nullptr, nullptr, nullptr)) >= 0) { 904 sa->add(sa->set, start); 905 start = end + 1; 906 } 907 } 908 909 U_CAPI bool U_EXPORT2 910 u_hasIDType(UChar32 c, UIdentifierType type) { 911 uint32_t typeIndex = type; // also guards against negative type integers 912 if (typeIndex >= UPRV_LENGTHOF(uprops_idTypeToEncoded)) { 913 return false; 914 } 915 uint32_t encodedType = uprops_idTypeToEncoded[typeIndex]; 916 uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT; 917 if ((encodedType & UPROPS_ID_TYPE_BIT) != 0) { 918 return value < UPROPS_ID_TYPE_FORBIDDEN && (value & encodedType) != 0; 919 } else { 920 return value == encodedType; 921 } 922 } 923 924 namespace { 925 926 void maybeAppendType(uint32_t value, uint32_t bit, UIdentifierType t, 927 UIdentifierType *types, int32_t &length, int32_t capacity) { 928 if ((value & bit) != 0) { 929 if (length < capacity) { 930 types[length] = t; 931 } 932 ++length; 933 } 934 } 935 936 } // namespace 937 938 U_CAPI int32_t U_EXPORT2 939 u_getIDTypes(UChar32 c, UIdentifierType *types, int32_t capacity, UErrorCode *pErrorCode) { 940 if (U_FAILURE(*pErrorCode)) { return 0; } 941 if (capacity < 0 || (capacity > 0 && types == nullptr)) { 942 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 943 return 0; 944 } 945 uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT; 946 if ((value & UPROPS_ID_TYPE_FORBIDDEN) == UPROPS_ID_TYPE_FORBIDDEN || 947 value == UPROPS_ID_TYPE_NOT_CHARACTER) { 948 // single value 949 if (capacity > 0) { 950 UIdentifierType t; 951 switch (value) { 952 case UPROPS_ID_TYPE_NOT_CHARACTER: t = U_ID_TYPE_NOT_CHARACTER; break; 953 case UPROPS_ID_TYPE_DEPRECATED: t = U_ID_TYPE_DEPRECATED; break; 954 case UPROPS_ID_TYPE_DEFAULT_IGNORABLE: t = U_ID_TYPE_DEFAULT_IGNORABLE; break; 955 case UPROPS_ID_TYPE_NOT_NFKC: t = U_ID_TYPE_NOT_NFKC; break; 956 case UPROPS_ID_TYPE_INCLUSION: t = U_ID_TYPE_INCLUSION; break; 957 case UPROPS_ID_TYPE_RECOMMENDED: t = U_ID_TYPE_RECOMMENDED; break; 958 default: 959 *pErrorCode = U_INVALID_FORMAT_ERROR; 960 return 0; 961 } 962 types[0] = t; 963 } else { 964 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 965 } 966 return 1; 967 } else { 968 // one or more combinable bits 969 int32_t length = 0; 970 maybeAppendType(value, UPROPS_ID_TYPE_NOT_XID, U_ID_TYPE_NOT_XID, 971 types, length, capacity); 972 maybeAppendType(value, UPROPS_ID_TYPE_EXCLUSION, U_ID_TYPE_EXCLUSION, 973 types, length, capacity); 974 maybeAppendType(value, UPROPS_ID_TYPE_OBSOLETE, U_ID_TYPE_OBSOLETE, 975 types, length, capacity); 976 maybeAppendType(value, UPROPS_ID_TYPE_TECHNICAL, U_ID_TYPE_TECHNICAL, 977 types, length, capacity); 978 maybeAppendType(value, UPROPS_ID_TYPE_UNCOMMON_USE, U_ID_TYPE_UNCOMMON_USE, 979 types, length, capacity); 980 maybeAppendType(value, UPROPS_ID_TYPE_LIMITED_USE, U_ID_TYPE_LIMITED_USE, 981 types, length, capacity); 982 if (length >= capacity) { 983 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 984 } 985 return length; 986 } 987 } 988 989 #if !UCONFIG_NO_NORMALIZATION 990 991 U_CAPI int32_t U_EXPORT2 992 u_getFC_NFKC_Closure(UChar32 c, char16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) { 993 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 994 return 0; 995 } 996 if(destCapacity<0 || (dest==nullptr && destCapacity>0)) { 997 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 998 return 0; 999 } 1000 // Compute the FC_NFKC_Closure on the fly: 1001 // We have the API for complete coverage of Unicode properties, although 1002 // this value by itself is not useful via API. 1003 // (What could be useful is a custom normalization table that combines 1004 // case folding and NFKC.) 1005 // For the derivation, see Unicode's DerivedNormalizationProps.txt. 1006 const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode); 1007 if(U_FAILURE(*pErrorCode)) { 1008 return 0; 1009 } 1010 // first: b = NFKC(Fold(a)) 1011 UnicodeString folded1String; 1012 const char16_t *folded1; 1013 int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT); 1014 if(folded1Length<0) { 1015 const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc); 1016 if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) { 1017 return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC 1018 } 1019 folded1String.setTo(c); 1020 } else { 1021 if(folded1Length>UCASE_MAX_STRING_LENGTH) { 1022 folded1String.setTo(folded1Length); 1023 } else { 1024 folded1String.setTo(false, folded1, folded1Length); 1025 } 1026 } 1027 UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode); 1028 // second: c = NFKC(Fold(b)) 1029 UnicodeString folded2String(kc1); 1030 UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode); 1031 // if (c != b) add the mapping from a to c 1032 if(U_FAILURE(*pErrorCode) || kc1==kc2) { 1033 return u_terminateUChars(dest, destCapacity, 0, pErrorCode); 1034 } else { 1035 return kc2.extract(dest, destCapacity, *pErrorCode); 1036 } 1037 } 1038 1039 #endif