icuexportdata.cpp (62501B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include <cstddef> 5 #include <cstdint> 6 #include <cstdio> 7 #include <iostream> 8 #include "unicode/localpointer.h" 9 #include "unicode/umachine.h" 10 #include "unicode/unistr.h" 11 #include "unicode/urename.h" 12 #include "unicode/uset.h" 13 #include <vector> 14 #include <algorithm> 15 #include "toolutil.h" 16 #include "uoptions.h" 17 #include "cmemory.h" 18 #include "charstr.h" 19 #include "cstring.h" 20 #include "unicode/uchar.h" 21 #include "unicode/errorcode.h" 22 #include "unicode/uniset.h" 23 #include "unicode/uscript.h" 24 #include "unicode/putil.h" 25 #include "unicode/umutablecptrie.h" 26 #include "unicode/ucharstriebuilder.h" 27 #include "ucase.h" 28 #include "unicode/normalizer2.h" 29 #include "uprops.h" 30 #include "normalizer2impl.h" 31 #include "writesrc.h" 32 33 U_NAMESPACE_USE 34 35 /* 36 * Global - verbosity 37 */ 38 UBool VERBOSE = false; 39 UBool QUIET = false; 40 41 UBool haveCopyright = true; 42 UCPTrieType trieType = UCPTRIE_TYPE_SMALL; 43 const char* destdir = ""; 44 45 // Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits. 46 int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON = 0x0400; 47 int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800; 48 int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER = 0x0c00; 49 50 void handleError(ErrorCode& status, int line, const char* context) { 51 if (status.isFailure()) { 52 std::cerr << "Error[" << line << "]: " << context << ": " << status.errorName() << std::endl; 53 exit(status.reset()); 54 } 55 } 56 57 class PropertyValueNameGetter : public ValueNameGetter { 58 public: 59 PropertyValueNameGetter(UProperty prop) : property(prop) {} 60 ~PropertyValueNameGetter() override; 61 const char *getName(uint32_t value) override { 62 return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME); 63 } 64 65 private: 66 UProperty property; 67 }; 68 69 PropertyValueNameGetter::~PropertyValueNameGetter() {} 70 71 // Dump an aliases = [...] key for properties with aliases 72 void dumpPropertyAliases(UProperty uproperty, FILE* f) { 73 int i = U_LONG_PROPERTY_NAME + 1; 74 75 while(true) { 76 // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, 77 // and returning null after that 78 const char* alias = u_getPropertyName(uproperty, static_cast<UPropertyNameChoice>(i)); 79 if (!alias) { 80 break; 81 } 82 if (i == U_LONG_PROPERTY_NAME + 1) { 83 fprintf(f, "aliases = [\"%s\"", alias); 84 } else { 85 fprintf(f, ", \"%s\"", alias); 86 } 87 i++; 88 } 89 if (i != U_LONG_PROPERTY_NAME + 1) { 90 fprintf(f, "]\n"); 91 } 92 } 93 94 void dumpBinaryProperty(UProperty uproperty, FILE* f) { 95 IcuToolErrorCode status("icuexportdata: dumpBinaryProperty"); 96 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); 97 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); 98 const USet* uset = u_getBinaryPropertySet(uproperty, status); 99 handleError(status, __LINE__, fullPropName); 100 101 fputs("[[binary_property]]\n", f); 102 fprintf(f, "long_name = \"%s\"\n", fullPropName); 103 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); 104 fprintf(f, "uproperty_discr = 0x%X\n", uproperty); 105 dumpPropertyAliases(uproperty, f); 106 usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML); 107 } 108 109 // If the value exists, dump an indented entry of the format 110 // `" {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"` 111 void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) { 112 const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME); 113 const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME); 114 if (!fullValueName) { 115 return; 116 } 117 if (is_mask) { 118 fprintf(f, " {discr = 0x%X", v); 119 } else { 120 fprintf(f, " {discr = %i", v); 121 } 122 fprintf(f, ", long = \"%s\"", fullValueName); 123 if (shortValueName) { 124 fprintf(f, ", short = \"%s\"", shortValueName); 125 } 126 int i = U_LONG_PROPERTY_NAME + 1; 127 while(true) { 128 // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, 129 // and returning null after that 130 const char* alias = u_getPropertyValueName(uproperty, v, static_cast<UPropertyNameChoice>(i)); 131 if (!alias) { 132 break; 133 } 134 if (i == U_LONG_PROPERTY_NAME + 1) { 135 fprintf(f, ", aliases = [\"%s\"", alias); 136 } else { 137 fprintf(f, ", \"%s\"", alias); 138 } 139 i++; 140 } 141 if (i != U_LONG_PROPERTY_NAME + 1) { 142 fprintf(f, "]"); 143 } 144 fprintf(f, "},\n"); 145 } 146 147 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) { 148 IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty"); 149 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); 150 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); 151 const UCPMap* umap = u_getIntPropertyMap(uproperty, status); 152 handleError(status, __LINE__, fullPropName); 153 154 fputs("[[enum_property]]\n", f); 155 fprintf(f, "long_name = \"%s\"\n", fullPropName); 156 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); 157 fprintf(f, "uproperty_discr = 0x%X\n", uproperty); 158 dumpPropertyAliases(uproperty, f); 159 160 int32_t minValue = u_getIntPropertyMinValue(uproperty); 161 U_ASSERT(minValue >= 0); 162 int32_t maxValue = u_getIntPropertyMaxValue(uproperty); 163 U_ASSERT(maxValue >= 0); 164 165 fprintf(f, "values = [\n"); 166 for (int v = minValue; v <= maxValue; v++) { 167 dumpValueEntry(uproperty, v, false, f); 168 } 169 fprintf(f, "]\n"); 170 171 PropertyValueNameGetter valueNameGetter(uproperty); 172 usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML); 173 fputs("\n", f); 174 175 176 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32; 177 if (maxValue <= 0xff) { 178 width = UCPTRIE_VALUE_BITS_8; 179 } else if (maxValue <= 0xffff) { 180 width = UCPTRIE_VALUE_BITS_16; 181 } 182 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status)); 183 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 184 builder.getAlias(), 185 trieType, 186 width, 187 status)); 188 handleError(status, __LINE__, fullPropName); 189 190 fputs("[enum_property.code_point_trie]\n", f); 191 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 192 } 193 194 /* 195 * Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated 196 * properties are dumped to file. 197 * Note: the data will store 0 for code points without a value defined for 198 * Bidi_Mirroring_Glyph. 199 */ 200 void dumpBidiMirroringGlyph(FILE* f) { 201 UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH; 202 IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph"); 203 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); 204 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); 205 handleError(status, __LINE__, fullPropName); 206 207 // Store 21-bit code point as is 208 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32; 209 210 // note: unlike dumpEnumeratedProperty, which can get inversion map data using 211 // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph 212 // is to use u_charMirror(cp) over the code point space. 213 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); 214 for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) { 215 UChar32 mirroringGlyph = u_charMirror(c); 216 // The trie builder code throws an error when it cannot compress the data sufficiently. 217 // Therefore, when the value is undefined for a code point, keep a 0 in the trie 218 // instead of the ICU API behavior of returning the code point value. Using 0 219 // results in a relatively significant space savings by not including redundant data. 220 if (c != mirroringGlyph) { 221 umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status); 222 } 223 } 224 225 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 226 builder.getAlias(), 227 trieType, 228 width, 229 status)); 230 handleError(status, __LINE__, fullPropName); 231 232 // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp) 233 const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias()); 234 235 fputs("[[enum_property]]\n", f); 236 fprintf(f, "long_name = \"%s\"\n", fullPropName); 237 if (shortPropName) { 238 fprintf(f, "short_name = \"%s\"\n", shortPropName); 239 } 240 fprintf(f, "uproperty_discr = 0x%X\n", uproperty); 241 dumpPropertyAliases(uproperty, f); 242 243 usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML); 244 fputs("\n", f); 245 246 fputs("[enum_property.code_point_trie]\n", f); 247 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 248 } 249 250 // After printing property value `v`, print `mask` if and only if `mask` comes immediately 251 // after the property in the listing 252 void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) { 253 if (U_MASK(v) < mask && U_MASK(v + 1) > mask) 254 dumpValueEntry(uproperty, mask, true, f); 255 } 256 257 void dumpGeneralCategoryMask(FILE* f) { 258 IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask"); 259 UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK; 260 261 fputs("[[mask_property]]\n", f); 262 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); 263 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); 264 fprintf(f, "long_name = \"%s\"\n", fullPropName); 265 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); 266 fprintf(f, "uproperty_discr = 0x%X\n", uproperty); 267 dumpPropertyAliases(uproperty, f); 268 269 270 fprintf(f, "mask_for = \"General_Category\"\n"); 271 int32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY); 272 U_ASSERT(minValue >= 0); 273 int32_t maxValue = u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY); 274 U_ASSERT(maxValue >= 0); 275 276 fprintf(f, "values = [\n"); 277 for (int32_t v = minValue; v <= maxValue; v++) { 278 dumpValueEntry(uproperty, U_MASK(v), true, f); 279 280 // We want to dump these masks "in order", which means they 281 // should come immediately after every property they contain 282 maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f); 283 maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f); 284 maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f); 285 maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f); 286 maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f); 287 maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f); 288 maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f); 289 maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f); 290 } 291 fprintf(f, "]\n"); 292 } 293 294 namespace { 295 296 void U_CALLCONV 297 set_add(USet *set, UChar32 c) { 298 UnicodeSet::fromUSet(set)->add(c); 299 } 300 301 void U_CALLCONV 302 set_addRange(USet *set, UChar32 start, UChar32 end) { 303 UnicodeSet::fromUSet(set)->add(start, end); 304 } 305 306 } 307 308 UnicodeSet getScriptExtensionsCodePoints(IcuToolErrorCode &errorCode) { 309 UnicodeSet scxCPs; 310 USetAdder sa = { 311 scxCPs.toUSet(), 312 set_add, 313 set_addRange, 314 nullptr, // don't need addString, 315 nullptr, // don't need remove() 316 nullptr // don't need removeRange() 317 }; 318 uprv_addScriptExtensionsCodePoints(&sa, errorCode); 319 return scxCPs; 320 } 321 322 void dumpScriptExtensions(FILE* f) { 323 IcuToolErrorCode status("icuexportdata: dumpScriptExtensions"); 324 325 fputs("[[script_extensions]]\n", f); 326 const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME); 327 const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME); 328 fprintf(f, "long_name = \"%s\"\n", scxFullPropName); 329 if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName); 330 fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS); 331 dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f); 332 333 // We want to use 16 bits for our exported trie of sc/scx data because we 334 // need 12 bits to match the 12 bits of data stored for sc/scx in the trie 335 // in the uprops.icu data file. 336 UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16; 337 338 // Create a mutable UCPTrie builder populated with Script property values data. 339 const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status); 340 handleError(status, __LINE__, scxFullPropName); 341 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status)); 342 handleError(status, __LINE__, scxFullPropName); 343 344 // The values for the output scx companion array. 345 // Invariant is that all subvectors are distinct. 346 std::vector< std::vector<uint16_t> > outputDedupVec; 347 348 // The sc/scx companion array is an array of arrays (of script codes) 349 fputs("script_code_array = [\n", f); 350 UnicodeSet scxCodePoints = getScriptExtensionsCodePoints(status); 351 for(const UChar32 cp : scxCodePoints.codePoints()) { 352 // Get the Script value 353 uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp); 354 // Get the Script_Extensions value (array of Script codes) 355 const int32_t SCX_ARRAY_CAPACITY = 32; 356 UScriptCode scxValArray[SCX_ARRAY_CAPACITY]; 357 int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status); 358 handleError(status, __LINE__, scxFullPropName); 359 360 // Convert the scx array into a vector 361 std::vector<uint16_t> scxValVec; 362 for(int i = 0; i < numScripts; i++) { 363 scxValVec.push_back(scxValArray[i]); 364 } 365 // Ensure that it is sorted 366 std::sort(scxValVec.begin(), scxValVec.end()); 367 // Copy the Script value into the first position of the scx array only 368 // if we have the "other" case (Script value is not Common nor Inherited). 369 // This offers faster access when users want only the Script value. 370 if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) { 371 scxValVec.insert(scxValVec.begin(), scVal); 372 } 373 374 // See if there is already an scx value array matching the newly built one. 375 // If there is, then use its index. 376 // If not, then append the new value array. 377 bool isScxValUnique = true; 378 size_t outputIndex = 0; 379 for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) { 380 if (outputDedupVec[outputIndex] == scxValVec) { 381 isScxValUnique = false; 382 break; 383 } 384 } 385 386 if (isScxValUnique) { 387 outputDedupVec.push_back(scxValVec); 388 usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n"); 389 } 390 391 // We must update the value in the UCPTrie for the code point to contain: 392 // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is 393 // the index into the companion array 394 // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether 395 // 3: other 396 // 2: Script=Inherited 397 // 1: Script=Common 398 // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases) 399 uint16_t mask = 0; 400 if (scVal == USCRIPT_COMMON) { 401 mask = DATAEXPORT_SCRIPT_X_WITH_COMMON; 402 } else if (scVal == USCRIPT_INHERITED) { 403 mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED; 404 } else { 405 mask = DATAEXPORT_SCRIPT_X_WITH_OTHER; 406 } 407 408 // The new trie value is the index into the new array with the high order bits set 409 uint32_t newScVal = outputIndex | mask; 410 411 // Update the code point in the mutable trie builder with the trie value 412 umutablecptrie_set(builder.getAlias(), cp, newScVal, status); 413 handleError(status, __LINE__, scxFullPropName); 414 } 415 fputs("]\n\n", f); // Print the TOML close delimiter for the outer array. 416 417 // Convert from mutable trie builder to immutable trie. 418 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 419 builder.getAlias(), 420 trieType, 421 scWidth, 422 status)); 423 handleError(status, __LINE__, scxFullPropName); 424 425 fputs("[script_extensions.code_point_trie]\n", f); 426 usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 427 } 428 429 FILE* prepareOutputFile(const char* basename) { 430 IcuToolErrorCode status("icuexportdata"); 431 CharString outFileName; 432 if (destdir != nullptr && *destdir != 0) { 433 outFileName.append(destdir, status).ensureEndsWithFileSeparator(status); 434 } 435 outFileName.append(basename, status); 436 outFileName.append(".toml", status); 437 handleError(status, __LINE__, basename); 438 439 FILE* f = fopen(outFileName.data(), "w"); 440 if (f == nullptr) { 441 std::cerr << "Unable to open file: " << outFileName.data() << std::endl; 442 exit(U_FILE_ACCESS_ERROR); 443 } 444 if (!QUIET) { 445 std::cout << "Writing to: " << outFileName.data() << std::endl; 446 } 447 448 if (haveCopyright) { 449 usrc_writeCopyrightHeader(f, "#", 2021); 450 } 451 usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp"); 452 453 return f; 454 } 455 456 #if !UCONFIG_NO_NORMALIZATION 457 458 class PendingDescriptor { 459 public: 460 UChar32 scalar; 461 uint32_t descriptorOrFlags; 462 // If false, we use the above fields only. If true, descriptor only 463 // contains the two highest-bit flags and the rest is computed later 464 // from the fields below. 465 UBool complex; 466 UBool supplementary; 467 UBool onlyNonStartersInTrail; 468 uint32_t len; 469 uint32_t offset; 470 471 PendingDescriptor(UChar32 scalar, uint32_t descriptor); 472 PendingDescriptor(UChar32 scalar, uint32_t flags, UBool supplementary, UBool onlyNonStartersInTrail, uint32_t len, uint32_t offset); 473 }; 474 475 PendingDescriptor::PendingDescriptor(UChar32 scalar, uint32_t descriptor) 476 : scalar(scalar), descriptorOrFlags(descriptor), complex(false), supplementary(false), onlyNonStartersInTrail(false), len(0), offset(0) {} 477 478 PendingDescriptor::PendingDescriptor(UChar32 scalar, uint32_t flags, UBool supplementary, UBool onlyNonStartersInTrail, uint32_t len, uint32_t offset) 479 : scalar(scalar), descriptorOrFlags(flags), complex(true), supplementary(supplementary), onlyNonStartersInTrail(onlyNonStartersInTrail), len(len), offset(offset) {} 480 481 void writeCanonicalCompositions(USet* backwardCombiningStarters) { 482 IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions"); 483 const char* basename = "compositions"; 484 FILE* f = prepareOutputFile(basename); 485 486 LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status); 487 488 const int32_t DECOMPOSITION_BUFFER_SIZE = 20; 489 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; 490 491 const Normalizer2* nfc = Normalizer2::getNFCInstance(status); 492 for (UChar32 c = 0; c <= 0x10FFFF; ++c) { 493 if (c >= 0xD800 && c < 0xE000) { 494 // Surrogate 495 continue; 496 } 497 UnicodeString decomposition; 498 if (!nfc->getRawDecomposition(c, decomposition)) { 499 continue; 500 } 501 int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); 502 if (len != 2) { 503 continue; 504 } 505 UChar32 starter = utf32[0]; 506 UChar32 second = utf32[1]; 507 UChar32 composite = nfc->composePair(starter, second); 508 if (composite < 0) { 509 continue; 510 } 511 if (c != composite) { 512 status.set(U_INTERNAL_PROGRAM_ERROR); 513 handleError(status, __LINE__, basename); 514 } 515 if (!u_getCombiningClass(second)) { 516 uset_add(backwardCombiningStarters, second); 517 } 518 if (composite >= 0xAC00 && composite <= 0xD7A3) { 519 // Hangul syllable 520 continue; 521 } 522 523 UnicodeString backward; 524 backward.append(second); 525 backward.append(starter); 526 backwardBuilder->add(backward, static_cast<int32_t>(composite), status); 527 } 528 UnicodeString canonicalCompositionTrie; 529 backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status); 530 531 usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n"); 532 fclose(f); 533 handleError(status, __LINE__, basename); 534 } 535 536 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) { 537 FILE* f = prepareOutputFile(basename); 538 usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n"); 539 usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n"); 540 fclose(f); 541 } 542 543 void pendingInsertionsToTrie(const char* basename, UMutableCPTrie* trie, const std::vector<PendingDescriptor>& pendingTrieInsertions, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16) { 544 IcuToolErrorCode status("icuexportdata: pendingInsertionsToTrie"); 545 // Iterate backwards to insert lower code points in the trie first in case it matters 546 // for trie block allocation. 547 for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) { 548 const PendingDescriptor& pending = pendingTrieInsertions[i]; 549 if (pending.complex) { 550 uint32_t additional = 0; 551 uint32_t offset = pending.offset; 552 uint32_t len = pending.len; 553 if (!pending.supplementary) { 554 len -= 2; 555 if (offset >= baseSize16) { 556 // This is a offset to supplementary 16-bit data. We have 557 // 16-bit base data and 32-bit base data before. However, 558 // the 16-bit base data length is already part of offset. 559 additional = baseSize32; 560 } 561 } else { 562 len -= 1; 563 if (offset >= baseSize32) { 564 // This is an offset to supplementary 32-bit data. We have 16-bit 565 // base data, 32-bit base data, and 16-bit supplementary data before. 566 // However, the 32-bit base data length is already part 567 // of offset. 568 additional = baseSize16 + supplementSize16; 569 } else { 570 // This is an offset to 32-bit base data. We have 16-bit 571 // base data before. 572 additional = baseSize16; 573 } 574 } 575 // +1 to make offset always non-zero 576 offset += 1; 577 if (offset + additional > 0xFFF) { 578 status.set(U_INTERNAL_PROGRAM_ERROR); 579 handleError(status, __LINE__, basename); 580 } 581 if (len > 7) { 582 status.set(U_INTERNAL_PROGRAM_ERROR); 583 handleError(status, __LINE__, basename); 584 } 585 umutablecptrie_set(trie, pending.scalar, pending.descriptorOrFlags | (uint32_t(pending.onlyNonStartersInTrail) << 4) | len | (offset + additional) << 16, status); 586 } else { 587 umutablecptrie_set(trie, pending.scalar, pending.descriptorOrFlags, status); 588 } 589 } 590 } 591 592 /// Marker that the decomposition does not round trip via NFC. 593 const uint32_t NON_ROUND_TRIP_MASK = (1 << 30); 594 595 /// Marker that the first character of the decomposition can combine 596 /// backwards. 597 const uint32_t BACKWARD_COMBINING_MASK = (1 << 31); 598 599 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, const std::vector<PendingDescriptor>& nfdPendingTrieInsertions, char16_t passthroughCap) { 600 IcuToolErrorCode status("icuexportdata: writeDecompositionData"); 601 FILE* f = prepareOutputFile(basename); 602 603 // Zero is a magic number that means the character decomposes to itself. 604 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); 605 606 if (uprv_strcmp(basename, "uts46d") != 0) { 607 // Make surrogates decompose to U+FFFD. Don't do this for UTS 46, since this 608 // optimization is only used by the UTF-16 slice mode, and UTS 46 is not 609 // supported in slice modes (which do not support ignorables). 610 // Mark these as potentially backward-combining, to make lead surrogates 611 // for non-BMP characters that are backward-combining count as 612 // backward-combining just in case, though the backward-combiningness 613 // is not actually being looked at today. 614 umutablecptrie_setRange(builder.getAlias(), 0xD800, 0xDFFF, NON_ROUND_TRIP_MASK | BACKWARD_COMBINING_MASK | 0xFFFD, status); 615 } 616 617 // Add a marker value for Hangul syllables 618 umutablecptrie_setRange(builder.getAlias(), 0xAC00, 0xD7A3, 1, status); 619 620 // First put the NFD data in the trie, to be partially overwritten in the NFKD and UTS 46 cases. 621 // This is easier that changing the logic that computes the pending insertions. 622 pendingInsertionsToTrie(basename, builder.getAlias(), nfdPendingTrieInsertions, baseSize16, baseSize32, supplementSize16); 623 pendingInsertionsToTrie(basename, builder.getAlias(), pendingTrieInsertions, baseSize16, baseSize32, supplementSize16); 624 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 625 builder.getAlias(), 626 trieType, 627 UCPTRIE_VALUE_BITS_32, 628 status)); 629 handleError(status, __LINE__, basename); 630 631 // The ICU4X side has changed enough this whole block of expectation checking might be more appropriate to remove. 632 if (reference) { 633 if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) { 634 // NFD expectations don't hold. The set must not contain the half-width 635 // kana voicing marks and must contain iota subscript. 636 status.set(U_INTERNAL_PROGRAM_ERROR); 637 handleError(status, __LINE__, basename); 638 } 639 640 USet* halfWidthVoicing = uset_openEmpty(); 641 uset_add(halfWidthVoicing, 0xFF9E); 642 uset_add(halfWidthVoicing, 0xFF9F); 643 644 USet* iotaSubscript = uset_openEmpty(); 645 uset_add(iotaSubscript, 0x0345); 646 647 USet* halfWidthCheck = uset_cloneAsThawed(uset); 648 uset_removeAll(halfWidthCheck, reference); 649 if (!uset_equals(halfWidthCheck, halfWidthVoicing) && !uset_isEmpty(halfWidthCheck)) { 650 // The result was neither empty nor contained exactly 651 // the two half-width voicing marks. The ICU4X 652 // normalizer doesn't know how to deal with this case. 653 status.set(U_INTERNAL_PROGRAM_ERROR); 654 handleError(status, __LINE__, basename); 655 } 656 uset_close(halfWidthCheck); 657 658 USet* iotaCheck = uset_cloneAsThawed(reference); 659 uset_removeAll(iotaCheck, uset); 660 if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) { 661 // The result was neither empty nor contained exactly 662 // the iota subscript. The ICU4X normalizer doesn't 663 // know how to deal with this case. 664 status.set(U_INTERNAL_PROGRAM_ERROR); 665 handleError(status, __LINE__, basename); 666 } 667 668 uset_close(iotaSubscript); 669 uset_close(halfWidthVoicing); 670 } 671 fprintf(f, "cap = 0x%X\n", passthroughCap); 672 fprintf(f, "[trie]\n"); 673 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 674 fclose(f); 675 handleError(status, __LINE__, basename); 676 } 677 678 // Find the slice `needle` within `storage` and return its index, failing which, 679 // append all elements of `needle` to `storage` and return the index of it at the end. 680 template<typename T> 681 size_t findOrAppend(std::vector<T>& storage, const UChar32* needle, size_t needleLen) { 682 // Last index where we might find the start of the complete needle. 683 // bounds check is `i + needleLen <= storage.size()` since the inner 684 // loop will range from `i` to `i + needleLen - 1` (the `-1` is why we use `<=`) 685 for (size_t i = 0; i + needleLen <= storage.size(); i++) { 686 for (size_t j = 0;; j++) { 687 if (j == needleLen) { 688 return i; // found a match 689 } 690 if (storage[i + j] != static_cast<uint32_t>(needle[j])) { 691 break; 692 } 693 } 694 } 695 // We didn't find anything. Append, keeping the append index in mind. 696 size_t index = storage.size(); 697 for(size_t i = 0; i < needleLen; i++) { 698 storage.push_back(static_cast<T>(needle[i])); 699 } 700 701 return index; 702 } 703 704 705 // Computes data for canonical decompositions 706 // See components/normalizer/trie-value-format.md in the ICU4X repo 707 // for documentation of the trie value format. 708 void computeDecompositions(const char* basename, 709 const USet* backwardCombiningStarters, 710 std::vector<uint16_t>& storage16, 711 std::vector<uint32_t>& storage32, 712 USet* decompositionStartsWithNonStarter, 713 USet* decompositionStartsWithBackwardCombiningStarter, 714 std::vector<PendingDescriptor>& pendingTrieInsertions, 715 UChar32& decompositionPassthroughBound, 716 UChar32& compositionPassthroughBound) { 717 IcuToolErrorCode status("icuexportdata: computeDecompositions"); 718 const Normalizer2* mainNormalizer; 719 const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status); 720 const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status); 721 FILE* f = nullptr; 722 std::vector<uint32_t> nonRecursive32; 723 LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status)); 724 725 UBool uts46 = false; 726 727 if (uprv_strcmp(basename, "nfkd") == 0) { 728 mainNormalizer = Normalizer2::getNFKDInstance(status); 729 } else if (uprv_strcmp(basename, "uts46d") == 0) { 730 uts46 = true; 731 mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status); 732 } else { 733 mainNormalizer = nfdNormalizer; 734 f = prepareOutputFile("decompositionex"); 735 } 736 737 // Max length as of Unicode 14 is 4 for NFD. For NFKD the max 738 // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB). 739 const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9; 740 const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8; 741 const int32_t DECOMPOSITION_BUFFER_SIZE = 20; 742 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; 743 const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2; 744 UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE]; 745 746 // Iterate over all scalar values excluding Hangul syllables. 747 // 748 // We go backwards in order to better find overlapping decompositions. 749 // 750 // As of Unicode 14: 751 // Iterate forward without overlap search: 752 // nfd: 16 size: 896, 32 size: 173 753 // nfkd: 16 size: 3854, 32 size: 179 754 // 755 // Iterate forward with overlap search: 756 // nfd: 16 size: 888, 32 size: 173 757 // nfkd: 16 size: 3266, 32 size: 179 758 // 759 // Iterate backward with overlap search: 760 // nfd: 16 size: 776, 32 size: 173 761 // nfkd: 16 size: 2941, 32 size: 179 762 // 763 // UChar32 is signed! 764 for (UChar32 c = 0x10FFFF; c >= 0; --c) { 765 if (c >= 0xAC00 && c <= 0xD7A3) { 766 // Hangul syllable 767 continue; 768 } 769 if (c >= 0xD800 && c < 0xE000) { 770 // Surrogate 771 continue; 772 } 773 if (c == 0xFFFD) { 774 // REPLACEMENT CHARACTER 775 // This character is a starter that decomposes to self, 776 // so without a special case here it would end up as 777 // passthrough-eligible in all normalizations forms. 778 // However, in the potentially-ill-formed UTF-8 case 779 // UTF-8 errors return U+FFFD from the iterator, and 780 // errors need to be treated as ineligible for 781 // passthrough on the slice fast path. By giving 782 // U+FFFD a trie value whose flags make it ineligible 783 // for passthrough avoids a specific U+FFFD branch on 784 // the passthrough fast path. 785 pendingTrieInsertions.push_back({c, NON_ROUND_TRIP_MASK | BACKWARD_COMBINING_MASK}); 786 continue; 787 } 788 UnicodeString src; 789 UnicodeString dst; 790 src.append(c); 791 if (mainNormalizer != nfdNormalizer) { 792 UnicodeString inter; 793 mainNormalizer->normalize(src, inter, status); 794 nfdNormalizer->normalize(inter, dst, status); 795 } else { 796 nfdNormalizer->normalize(src, dst, status); 797 } 798 799 UnicodeString nfc; 800 nfcNormalizer->normalize(dst, nfc, status); 801 UBool roundTripsViaCanonicalComposition = (src == nfc); 802 803 int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); 804 805 if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { 806 if (!uts46) { 807 status.set(U_INTERNAL_PROGRAM_ERROR); 808 handleError(status, __LINE__, basename); 809 } 810 } 811 if (len > DECOMPOSITION_BUFFER_SIZE) { 812 status.set(U_INTERNAL_PROGRAM_ERROR); 813 handleError(status, __LINE__, basename); 814 } 815 uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]); 816 bool specialNonStarterDecomposition = false; 817 bool startsWithBackwardCombiningStarter = false; 818 if (firstCombiningClass) { 819 decompositionPassthroughBound = c; 820 compositionPassthroughBound = c; 821 uset_add(decompositionStartsWithNonStarter, c); 822 if (src != dst) { 823 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || (c == 0xFF9E && utf32[0] == 0x3099) || (c == 0xFF9F && utf32[0] == 0x309A)) { 824 specialNonStarterDecomposition = true; 825 } else { 826 // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X. 827 status.set(U_INTERNAL_PROGRAM_ERROR); 828 handleError(status, __LINE__, basename); 829 } 830 } 831 } else if (uset_contains(backwardCombiningStarters, utf32[0])) { 832 compositionPassthroughBound = c; 833 startsWithBackwardCombiningStarter = true; 834 uset_add(decompositionStartsWithBackwardCombiningStarter, c); 835 } 836 if (mainNormalizer != nfdNormalizer) { 837 UnicodeString nfd; 838 nfdNormalizer->normalize(src, nfd, status); 839 if (dst == nfd) { 840 continue; 841 } 842 decompositionPassthroughBound = c; 843 compositionPassthroughBound = c; 844 } 845 if (firstCombiningClass) { 846 len = 1; 847 if (specialNonStarterDecomposition) { 848 // Special marker 849 pendingTrieInsertions.push_back({c, NON_ROUND_TRIP_MASK | BACKWARD_COMBINING_MASK | 0xD900 | u_getCombiningClass(c)}); 850 } else { 851 // Use the surrogate range to store the canonical combining class 852 // XXX: Should non-started that decompose to self be marked as non-round-trippable in 853 // case such semantics turn out to be more useful for `NON_ROUND_TRIP_MASK`? 854 pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_MASK | 0xD800 | static_cast<uint32_t>(firstCombiningClass)}); 855 } 856 continue; 857 } else { 858 if (src == dst) { 859 if (startsWithBackwardCombiningStarter) { 860 pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_MASK}); 861 } 862 continue; 863 } 864 decompositionPassthroughBound = c; 865 // ICU4X hard-codes ANGSTROM SIGN 866 if (c != 0x212B && mainNormalizer == nfdNormalizer) { 867 UnicodeString raw; 868 if (!nfdNormalizer->getRawDecomposition(c, raw)) { 869 // We're always supposed to have a non-recursive decomposition 870 // if we had a recursive one. 871 status.set(U_INTERNAL_PROGRAM_ERROR); 872 handleError(status, __LINE__, basename); 873 } 874 // In addition to actual difference, put the whole range that contains characters 875 // with oxia into the non-recursive trie in order to catch cases where characters 876 // with oxia have singleton decompositions to corresponding characters with tonos. 877 // This way, the run-time decision to fall through can be done on the range 878 // without checking for individual characters inside the range. 879 if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) { 880 int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status); 881 if (!rawLen) { 882 status.set(U_INTERNAL_PROGRAM_ERROR); 883 handleError(status, __LINE__, basename); 884 } 885 if (rawLen == 1) { 886 if (c >= 0xFFFF) { 887 status.set(U_INTERNAL_PROGRAM_ERROR); 888 handleError(status, __LINE__, basename); 889 } 890 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, static_cast<uint32_t>(rawUtf32[0]), status); 891 } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) { 892 if (!rawUtf32[0] || !rawUtf32[1]) { 893 status.set(U_INTERNAL_PROGRAM_ERROR); 894 handleError(status, __LINE__, basename); 895 } 896 // Swapped for consistency with the primary trie 897 uint32_t bmpPair = static_cast<uint32_t>(rawUtf32[1]) << 16 | static_cast<uint32_t>(rawUtf32[0]); 898 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status); 899 } else { 900 // Let's add 1 to index to make it always non-zero to distinguish 901 // it from the default zero. 902 uint32_t index = nonRecursive32.size() + 1; 903 nonRecursive32.push_back(static_cast<uint32_t>(rawUtf32[0])); 904 nonRecursive32.push_back(static_cast<uint32_t>(rawUtf32[1])); 905 if (index > 0xFFFF) { 906 status.set(U_INTERNAL_PROGRAM_ERROR); 907 handleError(status, __LINE__, basename); 908 } 909 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status); 910 } 911 } 912 } 913 } 914 if (!roundTripsViaCanonicalComposition) { 915 compositionPassthroughBound = c; 916 } 917 if (!len) { 918 if (!uts46) { 919 status.set(U_INTERNAL_PROGRAM_ERROR); 920 handleError(status, __LINE__, basename); 921 } 922 pendingTrieInsertions.push_back({c, uint32_t(0xFFFFFFFF)}); 923 } else if (len == 1 && ((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) { 924 // Singleton decompositions to conjoining jamo. 925 if (mainNormalizer == nfdNormalizer) { 926 // Not supposed to happen in NFD 927 status.set(U_INTERNAL_PROGRAM_ERROR); 928 handleError(status, __LINE__, basename); 929 } 930 pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) | NON_ROUND_TRIP_MASK | (startsWithBackwardCombiningStarter ? BACKWARD_COMBINING_MASK : 0)}); 931 } else if (!startsWithBackwardCombiningStarter && len == 1 && utf32[0] <= 0xFFFF) { 932 pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) | NON_ROUND_TRIP_MASK | (startsWithBackwardCombiningStarter ? BACKWARD_COMBINING_MASK : 0)}); 933 } else if (c != 0x212B && // ANGSTROM SIGN is special to make the Harfbuzz case branch less in the more common case. 934 !startsWithBackwardCombiningStarter && 935 len == 2 && 936 utf32[0] <= 0x7FFF && 937 utf32[1] <= 0x7FFF && 938 utf32[0] > 0x1F && 939 utf32[1] > 0x1F && 940 !u_getCombiningClass(utf32[0]) && 941 u_getCombiningClass(utf32[1])) { 942 for (int32_t i = 0; i < len; ++i) { 943 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { 944 // Assert that iota subscript and half-width voicing marks never occur in these 945 // expansions in the normalization forms where they are special. 946 status.set(U_INTERNAL_PROGRAM_ERROR); 947 handleError(status, __LINE__, basename); 948 } 949 } 950 pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) | (static_cast<uint32_t>(utf32[1]) << 15) | (roundTripsViaCanonicalComposition ? 0 : NON_ROUND_TRIP_MASK)}); 951 } else { 952 UBool supplementary = false; 953 UBool nonInitialStarter = false; 954 for (int32_t i = 0; i < len; ++i) { 955 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { 956 // Assert that iota subscript and half-width voicing marks never occur in these 957 // expansions in the normalization forms where they are special. 958 status.set(U_INTERNAL_PROGRAM_ERROR); 959 handleError(status, __LINE__, basename); 960 } 961 962 if (utf32[i] > 0xFFFF) { 963 supplementary = true; 964 } 965 if (utf32[i] == 0) { 966 status.set(U_INTERNAL_PROGRAM_ERROR); 967 handleError(status, __LINE__, basename); 968 } 969 if (i != 0 && !u_getCombiningClass(utf32[i])) { 970 nonInitialStarter = true; 971 } 972 } 973 if (len == 1) { 974 // The format doesn't allow for length 1 for BMP, 975 // so if these ever occur, they need to be promoted 976 // to wider storage. As of Unicode 16 alpha, this 977 // case does not arise. 978 supplementary = true; 979 } 980 if (!supplementary) { 981 if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) { 982 if (len == 18 && c == 0xFDFA) { 983 // Special marker for the one character whose decomposition 984 // is too long. (Too long even if we took the fourth bit into use!) 985 pendingTrieInsertions.push_back({c, NON_ROUND_TRIP_MASK | 1}); 986 continue; 987 } else { 988 // Note: There's a fourth bit available, but let's error out 989 // if it's ever needed so that it doesn't get used without 990 // updating docs. 991 status.set(U_INTERNAL_PROGRAM_ERROR); 992 handleError(status, __LINE__, basename); 993 } 994 } 995 } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) { 996 // Note: There's a fourth bit available, but let's error out 997 // if it's ever needed so that it doesn't get used without 998 // updating docs. 999 status.set(U_INTERNAL_PROGRAM_ERROR); 1000 handleError(status, __LINE__, basename); 1001 } 1002 1003 size_t index = 0; 1004 if (!supplementary) { 1005 index = findOrAppend(storage16, utf32, len); 1006 } else { 1007 index = findOrAppend(storage32, utf32, len); 1008 } 1009 pendingTrieInsertions.push_back({c, (startsWithBackwardCombiningStarter ? BACKWARD_COMBINING_MASK : 0) | (roundTripsViaCanonicalComposition ? 0 : NON_ROUND_TRIP_MASK), supplementary, !nonInitialStarter, uint32_t(len), uint32_t(index)}); 1010 } 1011 } 1012 if (storage16.size() + storage32.size() > 0xFFF) { 1013 // We actually have 14 bits available, but let's error out so 1014 // that docs can be updated when taking a reserved bit out of 1015 // potential future flag usage. 1016 status.set(U_INTERNAL_PROGRAM_ERROR); 1017 } 1018 if (f) { 1019 usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n"); 1020 1021 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 1022 nonRecursiveBuilder.getAlias(), 1023 trieType, 1024 UCPTRIE_VALUE_BITS_32, 1025 status)); 1026 handleError(status, __LINE__, basename); 1027 1028 fprintf(f, "[trie]\n"); 1029 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 1030 1031 fclose(f); 1032 } 1033 handleError(status, __LINE__, basename); 1034 } 1035 1036 #endif // !UCONFIG_NO_NORMALIZATION 1037 1038 enum { 1039 OPT_HELP_H, 1040 OPT_HELP_QUESTION_MARK, 1041 OPT_MODE, 1042 OPT_TRIE_TYPE, 1043 OPT_VERSION, 1044 OPT_DESTDIR, 1045 OPT_ALL, 1046 OPT_INDEX, 1047 OPT_COPYRIGHT, 1048 OPT_VERBOSE, 1049 OPT_QUIET, 1050 1051 OPT_COUNT 1052 }; 1053 1054 #define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG) 1055 #define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG) 1056 #define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG) 1057 #define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG) 1058 1059 static UOption options[]={ 1060 UOPTION_HELP_H, 1061 UOPTION_HELP_QUESTION_MARK, 1062 UOPTION_MODE, 1063 UOPTION_TRIE_TYPE, 1064 UOPTION_VERSION, 1065 UOPTION_DESTDIR, 1066 UOPTION_ALL, 1067 UOPTION_INDEX, 1068 UOPTION_COPYRIGHT, 1069 UOPTION_VERBOSE, 1070 UOPTION_QUIET, 1071 }; 1072 1073 void printHelp(FILE* stdfile, const char* program) { 1074 fprintf(stdfile, 1075 "usage: %s -m mode [-options] [--all | properties...]\n" 1076 "\tdump Unicode property data to .toml files\n" 1077 "options:\n" 1078 "\t-h or -? or --help this usage text\n" 1079 "\t-V or --version show a version message\n" 1080 "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n" 1081 "\t --trie-type set the trie type (small or fast, default small)\n" 1082 "\t-d or --destdir destination directory, followed by the path\n" 1083 "\t --all write out all properties known to icuexportdata\n" 1084 "\t --index write an _index.toml summarizing all data exported\n" 1085 "\t-c or --copyright include a copyright notice\n" 1086 "\t-v or --verbose Turn on verbose output\n" 1087 "\t-q or --quiet do not display warnings and progress\n", 1088 program); 1089 } 1090 1091 int exportUprops(int argc, char* argv[]) { 1092 // Load list of Unicode properties 1093 std::vector<const char*> propNames; 1094 for (int i=1; i<argc; i++) { 1095 propNames.push_back(argv[i]); 1096 } 1097 if (options[OPT_ALL].doesOccur) { 1098 int i = UCHAR_BINARY_START; 1099 while (true) { 1100 if (i == UCHAR_BINARY_LIMIT) { 1101 i = UCHAR_INT_START; 1102 } 1103 if (i == UCHAR_INT_LIMIT) { 1104 i = UCHAR_GENERAL_CATEGORY_MASK; 1105 } 1106 if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) { 1107 i = UCHAR_BIDI_MIRRORING_GLYPH; 1108 } 1109 if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) { 1110 i = UCHAR_SCRIPT_EXTENSIONS; 1111 } 1112 if (i == UCHAR_SCRIPT_EXTENSIONS + 1) { 1113 break; 1114 } 1115 UProperty uprop = static_cast<UProperty>(i); 1116 const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME); 1117 if (propName == nullptr) { 1118 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME); 1119 if (propName != nullptr && VERBOSE) { 1120 std::cerr << "Note: falling back to long name for: " << propName << std::endl; 1121 } 1122 } 1123 if (propName != nullptr) { 1124 propNames.push_back(propName); 1125 } else { 1126 std::cerr << "Warning: Could not find name for: " << uprop << std::endl; 1127 } 1128 i++; 1129 } 1130 } 1131 1132 if (propNames.empty() 1133 || options[OPT_HELP_H].doesOccur 1134 || options[OPT_HELP_QUESTION_MARK].doesOccur 1135 || !options[OPT_MODE].doesOccur) { 1136 FILE *stdfile=argc<0 ? stderr : stdout; 1137 fprintf(stdfile, 1138 "usage: %s -m uprops [-options] [--all | properties...]\n" 1139 "\tdump Unicode property data to .toml files\n" 1140 "options:\n" 1141 "\t-h or -? or --help this usage text\n" 1142 "\t-V or --version show a version message\n" 1143 "\t-m or --mode mode: currently only 'uprops', but more may be added\n" 1144 "\t --trie-type set the trie type (small or fast, default small)\n" 1145 "\t-d or --destdir destination directory, followed by the path\n" 1146 "\t --all write out all properties known to icuexportdata\n" 1147 "\t --index write an _index.toml summarizing all data exported\n" 1148 "\t-c or --copyright include a copyright notice\n" 1149 "\t-v or --verbose Turn on verbose output\n" 1150 "\t-q or --quiet do not display warnings and progress\n", 1151 argv[0]); 1152 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 1153 } 1154 1155 const char* mode = options[OPT_MODE].value; 1156 if (uprv_strcmp(mode, "uprops") != 0) { 1157 fprintf(stderr, "Invalid option for --mode (must be uprops)\n"); 1158 return U_ILLEGAL_ARGUMENT_ERROR; 1159 } 1160 1161 if (options[OPT_TRIE_TYPE].doesOccur) { 1162 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { 1163 trieType = UCPTRIE_TYPE_FAST; 1164 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { 1165 trieType = UCPTRIE_TYPE_SMALL; 1166 } else { 1167 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); 1168 return U_ILLEGAL_ARGUMENT_ERROR; 1169 } 1170 } 1171 1172 for (const char* propName : propNames) { 1173 UProperty propEnum = u_getPropertyEnum(propName); 1174 if (propEnum == UCHAR_INVALID_CODE) { 1175 std::cerr << "Error: Invalid property alias: " << propName << std::endl; 1176 return U_ILLEGAL_ARGUMENT_ERROR; 1177 } 1178 1179 FILE* f = prepareOutputFile(propName); 1180 1181 UVersionInfo versionInfo; 1182 u_getUnicodeVersion(versionInfo); 1183 char uvbuf[U_MAX_VERSION_STRING_LENGTH]; 1184 u_versionToString(versionInfo, uvbuf); 1185 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", 1186 U_ICU_VERSION, 1187 uvbuf); 1188 1189 if (propEnum < UCHAR_BINARY_LIMIT) { 1190 dumpBinaryProperty(propEnum, f); 1191 } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) { 1192 dumpEnumeratedProperty(propEnum, f); 1193 } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) { 1194 dumpGeneralCategoryMask(f); 1195 } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) { 1196 dumpBidiMirroringGlyph(f); 1197 } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) { 1198 dumpScriptExtensions(f); 1199 } else { 1200 std::cerr << "Don't know how to write property: " << propEnum << std::endl; 1201 return U_INTERNAL_PROGRAM_ERROR; 1202 } 1203 1204 fclose(f); 1205 } 1206 1207 if (options[OPT_INDEX].doesOccur) { 1208 FILE* f = prepareOutputFile("_index"); 1209 fprintf(f, "index = [\n"); 1210 for (const char* propName : propNames) { 1211 // At this point, propName is a valid property name, so it should be alphanum ASCII 1212 fprintf(f, " { filename=\"%s.toml\" },\n", propName); 1213 } 1214 fprintf(f, "]\n"); 1215 fclose(f); 1216 } 1217 1218 return 0; 1219 } 1220 1221 struct AddRangeHelper { 1222 UMutableCPTrie* ucptrie; 1223 }; 1224 1225 static UBool U_CALLCONV 1226 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) { 1227 IcuToolErrorCode status("addRangeToUCPTrie"); 1228 UMutableCPTrie* ucptrie = static_cast<const AddRangeHelper*>(context)->ucptrie; 1229 umutablecptrie_setRange(ucptrie, start, end, value, status); 1230 handleError(status, __LINE__, "setRange"); 1231 1232 return true; 1233 } 1234 1235 int exportCase(int argc, char* argv[]) { 1236 if (argc > 1) { 1237 fprintf(stderr, "ucase mode does not expect additional arguments\n"); 1238 return U_ILLEGAL_ARGUMENT_ERROR; 1239 } 1240 (void) argv; // Suppress unused variable warning 1241 1242 IcuToolErrorCode status("icuexportdata"); 1243 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); 1244 handleError(status, __LINE__, "exportCase"); 1245 1246 int32_t exceptionsLength, unfoldLength; 1247 const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength); 1248 const UTrie2* caseTrie = &caseProps->trie; 1249 1250 AddRangeHelper helper = { builder.getAlias() }; 1251 utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper); 1252 1253 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16; 1254 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 1255 builder.getAlias(), 1256 trieType, 1257 width, 1258 status)); 1259 handleError(status, __LINE__, "exportCase"); 1260 1261 FILE* f = prepareOutputFile("ucase"); 1262 1263 UVersionInfo versionInfo; 1264 u_getUnicodeVersion(versionInfo); 1265 char uvbuf[U_MAX_VERSION_STRING_LENGTH]; 1266 u_versionToString(versionInfo, uvbuf); 1267 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", 1268 U_ICU_VERSION, 1269 uvbuf); 1270 1271 fputs("[ucase.code_point_trie]\n", f); 1272 usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 1273 fputs("\n", f); 1274 1275 const char* indent = " "; 1276 const char* suffix = "\n]\n"; 1277 1278 fputs("[ucase.exceptions]\n", f); 1279 const char* exceptionsPrefix = "exceptions = [\n "; 1280 int32_t exceptionsWidth = 16; 1281 usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth, 1282 exceptionsLength, indent, suffix); 1283 fputs("\n", f); 1284 1285 fputs("[ucase.unfold]\n", f); 1286 const char* unfoldPrefix = "unfold = [\n "; 1287 int32_t unfoldWidth = 16; 1288 usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth, 1289 unfoldLength, indent, suffix); 1290 1291 return 0; 1292 } 1293 1294 #if !UCONFIG_NO_NORMALIZATION 1295 1296 int exportNorm() { 1297 IcuToolErrorCode status("icuexportdata: exportNorm"); 1298 USet* backwardCombiningStarters = uset_openEmpty(); 1299 writeCanonicalCompositions(backwardCombiningStarters); 1300 1301 std::vector<uint16_t> storage16; 1302 std::vector<uint32_t> storage32; 1303 1304 // Note: the USets are not exported. They are only used to check that a new 1305 // Unicode version doesn't violate expectations that are hard-coded in ICU4X. 1306 USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty(); 1307 USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); 1308 std::vector<PendingDescriptor> nfdPendingTrieInsertions; 1309 UChar32 nfdBound = 0x10FFFF; 1310 UChar32 nfcBound = 0x10FFFF; 1311 computeDecompositions("nfd", 1312 backwardCombiningStarters, 1313 storage16, 1314 storage32, 1315 nfdDecompositionStartsWithNonStarter, 1316 nfdDecompositionStartsWithBackwardCombiningStarter, 1317 nfdPendingTrieInsertions, 1318 nfdBound, 1319 nfcBound); 1320 if (!(nfdBound == 0xC0 && nfcBound == 0x300)) { 1321 // Unexpected bounds for NFD/NFC. 1322 status.set(U_INTERNAL_PROGRAM_ERROR); 1323 handleError(status, __LINE__, "exportNorm"); 1324 } 1325 1326 uint32_t baseSize16 = storage16.size(); 1327 uint32_t baseSize32 = storage32.size(); 1328 1329 USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty(); 1330 USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); 1331 std::vector<PendingDescriptor> nfkdPendingTrieInsertions; 1332 UChar32 nfkdBound = 0x10FFFF; 1333 UChar32 nfkcBound = 0x10FFFF; 1334 computeDecompositions("nfkd", 1335 backwardCombiningStarters, 1336 storage16, 1337 storage32, 1338 nfkdDecompositionStartsWithNonStarter, 1339 nfkdDecompositionStartsWithBackwardCombiningStarter, 1340 nfkdPendingTrieInsertions, 1341 nfkdBound, 1342 nfkcBound); 1343 if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) { 1344 status.set(U_INTERNAL_PROGRAM_ERROR); 1345 handleError(status, __LINE__, "exportNorm"); 1346 } 1347 if (nfkcBound > 0xC0) { 1348 if (nfkdBound != 0xC0) { 1349 status.set(U_INTERNAL_PROGRAM_ERROR); 1350 handleError(status, __LINE__, "exportNorm"); 1351 } 1352 } else { 1353 if (nfkdBound != nfkcBound) { 1354 status.set(U_INTERNAL_PROGRAM_ERROR); 1355 handleError(status, __LINE__, "exportNorm"); 1356 } 1357 } 1358 1359 USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty(); 1360 USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); 1361 std::vector<PendingDescriptor> uts46PendingTrieInsertions; 1362 UChar32 uts46dBound = 0x10FFFF; 1363 UChar32 uts46Bound = 0x10FFFF; 1364 computeDecompositions("uts46d", 1365 backwardCombiningStarters, 1366 storage16, 1367 storage32, 1368 uts46DecompositionStartsWithNonStarter, 1369 uts46DecompositionStartsWithBackwardCombiningStarter, 1370 uts46PendingTrieInsertions, 1371 uts46dBound, 1372 uts46Bound); 1373 if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) { 1374 status.set(U_INTERNAL_PROGRAM_ERROR); 1375 handleError(status, __LINE__, "exportNorm"); 1376 } 1377 if (uts46Bound > 0xC0) { 1378 if (uts46dBound != 0xC0) { 1379 status.set(U_INTERNAL_PROGRAM_ERROR); 1380 handleError(status, __LINE__, "exportNorm"); 1381 } 1382 } else { 1383 if (uts46dBound != uts46Bound) { 1384 status.set(U_INTERNAL_PROGRAM_ERROR); 1385 handleError(status, __LINE__, "exportNorm"); 1386 } 1387 } 1388 1389 uint32_t supplementSize16 = storage16.size() - baseSize16; 1390 uint32_t supplementSize32 = storage32.size() - baseSize32; 1391 1392 writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, nfdPendingTrieInsertions, static_cast<char16_t>(nfcBound)); 1393 writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, nfdPendingTrieInsertions, static_cast<char16_t>(nfkcBound)); 1394 writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, nfdPendingTrieInsertions, static_cast<char16_t>(uts46Bound)); 1395 1396 writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32); 1397 writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32); 1398 1399 uset_close(nfdDecompositionStartsWithNonStarter); 1400 uset_close(nfkdDecompositionStartsWithNonStarter); 1401 uset_close(uts46DecompositionStartsWithNonStarter); 1402 1403 uset_close(nfdDecompositionStartsWithBackwardCombiningStarter); 1404 uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter); 1405 uset_close(uts46DecompositionStartsWithBackwardCombiningStarter); 1406 1407 uset_close(backwardCombiningStarters); 1408 handleError(status, __LINE__, "exportNorm"); 1409 return 0; 1410 } 1411 1412 #endif // !UCONFIG_NO_NORMALIZATION 1413 1414 int main(int argc, char* argv[]) { 1415 U_MAIN_INIT_ARGS(argc, argv); 1416 1417 /* preset then read command line options */ 1418 options[OPT_DESTDIR].value=u_getDataDirectory(); 1419 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); 1420 1421 if(options[OPT_VERSION].doesOccur) { 1422 printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n", 1423 U_ICU_DATA_VERSION); 1424 printf("%s\n", U_COPYRIGHT_STRING); 1425 exit(0); 1426 } 1427 1428 /* error handling, printing usage message */ 1429 if(argc<0) { 1430 fprintf(stderr, 1431 "error in command line argument \"%s\"\n", 1432 argv[-argc]); 1433 } 1434 1435 if (argc < 0 1436 || options[OPT_HELP_H].doesOccur 1437 || options[OPT_HELP_QUESTION_MARK].doesOccur 1438 || !options[OPT_MODE].doesOccur) { 1439 FILE *stdfile=argc<0 ? stderr : stdout; 1440 printHelp(stdfile, argv[0]); 1441 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 1442 } 1443 1444 /* get the options values */ 1445 haveCopyright = options[OPT_COPYRIGHT].doesOccur; 1446 destdir = options[OPT_DESTDIR].value; 1447 VERBOSE = options[OPT_VERBOSE].doesOccur; 1448 QUIET = options[OPT_QUIET].doesOccur; 1449 1450 if (options[OPT_TRIE_TYPE].doesOccur) { 1451 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { 1452 trieType = UCPTRIE_TYPE_FAST; 1453 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { 1454 trieType = UCPTRIE_TYPE_SMALL; 1455 } else { 1456 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); 1457 return U_ILLEGAL_ARGUMENT_ERROR; 1458 } 1459 } 1460 1461 const char* mode = options[OPT_MODE].value; 1462 if (uprv_strcmp(mode, "norm") == 0) { 1463 #if !UCONFIG_NO_NORMALIZATION 1464 return exportNorm(); 1465 #else 1466 fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n"); 1467 return U_ILLEGAL_ARGUMENT_ERROR; 1468 #endif 1469 } 1470 if (uprv_strcmp(mode, "uprops") == 0) { 1471 return exportUprops(argc, argv); 1472 } else if (uprv_strcmp(mode, "ucase") == 0) { 1473 return exportCase(argc, argv); 1474 } 1475 1476 fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n"); 1477 return U_ILLEGAL_ARGUMENT_ERROR; 1478 }