parse.cpp (80186B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1998-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * 11 * File parse.cpp 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 05/26/99 stephen Creation. 17 * 02/25/00 weiv Overhaul to write udata 18 * 5/10/01 Ram removed ustdio dependency 19 * 06/10/2001 Dominic Ludlam <dom@recoil.org> Rewritten 20 ******************************************************************************* 21 */ 22 23 // Safer use of UnicodeString. 24 #include <cstdint> 25 #include "unicode/umachine.h" 26 #ifndef UNISTR_FROM_CHAR_EXPLICIT 27 # define UNISTR_FROM_CHAR_EXPLICIT explicit 28 #endif 29 30 // Less important, but still a good idea. 31 #ifndef UNISTR_FROM_STRING_EXPLICIT 32 # define UNISTR_FROM_STRING_EXPLICIT explicit 33 #endif 34 35 #include <assert.h> 36 #include "parse.h" 37 #include "errmsg.h" 38 #include "uhash.h" 39 #include "cmemory.h" 40 #include "cstring.h" 41 #include "uinvchar.h" 42 #include "read.h" 43 #include "ustr.h" 44 #include "reslist.h" 45 #include "rbt_pars.h" 46 #include "genrb.h" 47 #include "unicode/normalizer2.h" 48 #include "unicode/stringpiece.h" 49 #include "unicode/unistr.h" 50 #include "unicode/ustring.h" 51 #include "unicode/uscript.h" 52 #include "unicode/utf16.h" 53 #include "unicode/putil.h" 54 #include "charstr.h" 55 #include "collationbuilder.h" 56 #include "collationdata.h" 57 #include "collationdatareader.h" 58 #include "collationdatawriter.h" 59 #include "collationfastlatinbuilder.h" 60 #include "collationinfo.h" 61 #include "collationroot.h" 62 #include "collationruleparser.h" 63 #include "collationtailoring.h" 64 #include <stdio.h> 65 #include "writesrc.h" 66 67 /* Number of tokens to read ahead of the current stream position */ 68 #define MAX_LOOKAHEAD 3 69 70 #define CR 0x000D 71 #define LF 0x000A 72 #define SPACE 0x0020 73 #define TAB 0x0009 74 #define ESCAPE 0x005C 75 #define HASH 0x0023 76 #define QUOTE 0x0027 77 #define ZERO 0x0030 78 #define STARTCOMMAND 0x005B 79 #define ENDCOMMAND 0x005D 80 #define OPENSQBRACKET 0x005B 81 #define CLOSESQBRACKET 0x005D 82 83 #define ICU4X_DIACRITIC_BASE 0x0300 84 #define ICU4X_DIACRITIC_LIMIT 0x034F 85 86 using icu::CharString; 87 using icu::LocalMemory; 88 using icu::LocalPointer; 89 using icu::LocalUCHARBUFPointer; 90 using icu::StringPiece; 91 using icu::UnicodeString; 92 93 struct Lookahead 94 { 95 enum ETokenType type; 96 struct UString value; 97 struct UString comment; 98 uint32_t line; 99 }; 100 101 /* keep in sync with token defines in read.h */ 102 const char *tokenNames[TOK_TOKEN_COUNT] = 103 { 104 "string", /* A string token, such as "MonthNames" */ 105 "'{'", /* An opening brace character */ 106 "'}'", /* A closing brace character */ 107 "','", /* A comma */ 108 "':'", /* A colon */ 109 110 "<end of file>", /* End of the file has been reached successfully */ 111 "<end of line>" 112 }; 113 114 /* Just to store "TRUE" */ 115 //static const char16_t trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000}; 116 117 typedef struct { 118 struct Lookahead lookahead[MAX_LOOKAHEAD + 1]; 119 uint32_t lookaheadPosition; 120 UCHARBUF *buffer; 121 struct SRBRoot *bundle; 122 const char *inputdir; 123 uint32_t inputdirLength; 124 const char *outputdir; 125 uint32_t outputdirLength; 126 const char *filename; 127 UBool makeBinaryCollation; 128 UBool omitCollationRules; 129 UBool icu4xMode; 130 } ParseState; 131 132 typedef struct SResource * 133 ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status); 134 135 static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status); 136 137 /* The nature of the lookahead buffer: 138 There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides 139 MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value. 140 When getToken is called, the current pointer is moved to the next slot and the 141 old slot is filled with the next token from the reader by calling getNextToken. 142 The token values are stored in the slot, which means that token values don't 143 survive a call to getToken, ie. 144 145 UString *value; 146 147 getToken(&value, nullptr, status); 148 getToken(nullptr, nullptr, status); bad - value is now a different string 149 */ 150 static void 151 initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status) 152 { 153 static uint32_t initTypeStrings = 0; 154 uint32_t i; 155 156 if (!initTypeStrings) 157 { 158 initTypeStrings = 1; 159 } 160 161 state->lookaheadPosition = 0; 162 state->buffer = buf; 163 164 resetLineNumber(); 165 166 for (i = 0; i < MAX_LOOKAHEAD; i++) 167 { 168 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status); 169 if (U_FAILURE(*status)) 170 { 171 return; 172 } 173 } 174 175 *status = U_ZERO_ERROR; 176 } 177 178 static void 179 cleanupLookahead(ParseState* state) 180 { 181 uint32_t i; 182 for (i = 0; i <= MAX_LOOKAHEAD; i++) 183 { 184 ustr_deinit(&state->lookahead[i].value); 185 ustr_deinit(&state->lookahead[i].comment); 186 } 187 188 } 189 190 static enum ETokenType 191 getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status) 192 { 193 enum ETokenType result; 194 uint32_t i; 195 196 result = state->lookahead[state->lookaheadPosition].type; 197 198 if (tokenValue != nullptr) 199 { 200 *tokenValue = &state->lookahead[state->lookaheadPosition].value; 201 } 202 203 if (linenumber != nullptr) 204 { 205 *linenumber = state->lookahead[state->lookaheadPosition].line; 206 } 207 208 if (comment != nullptr) 209 { 210 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status); 211 } 212 213 i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1); 214 state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1); 215 ustr_setlen(&state->lookahead[i].comment, 0, status); 216 ustr_setlen(&state->lookahead[i].value, 0, status); 217 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status); 218 219 /* printf("getToken, returning %s\n", tokenNames[result]); */ 220 221 return result; 222 } 223 224 static enum ETokenType 225 peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status) 226 { 227 uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1); 228 229 if (U_FAILURE(*status)) 230 { 231 return TOK_ERROR; 232 } 233 234 if (lookaheadCount >= MAX_LOOKAHEAD) 235 { 236 *status = U_INTERNAL_PROGRAM_ERROR; 237 return TOK_ERROR; 238 } 239 240 if (tokenValue != nullptr) 241 { 242 *tokenValue = &state->lookahead[i].value; 243 } 244 245 if (linenumber != nullptr) 246 { 247 *linenumber = state->lookahead[i].line; 248 } 249 250 if(comment != nullptr){ 251 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status); 252 } 253 254 return state->lookahead[i].type; 255 } 256 257 static void 258 expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status) 259 { 260 uint32_t line; 261 262 enum ETokenType token = getToken(state, tokenValue, comment, &line, status); 263 264 if (linenumber != nullptr) 265 { 266 *linenumber = line; 267 } 268 269 if (U_FAILURE(*status)) 270 { 271 return; 272 } 273 274 if (token != expectedToken) 275 { 276 *status = U_INVALID_FORMAT_ERROR; 277 error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]); 278 } 279 else 280 { 281 *status = U_ZERO_ERROR; 282 } 283 } 284 285 static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment, 286 int32_t &stringLength, UErrorCode *status) 287 { 288 struct UString *tokenValue; 289 char *result; 290 291 expect(state, TOK_STRING, &tokenValue, comment, line, status); 292 293 if (U_FAILURE(*status)) 294 { 295 return nullptr; 296 } 297 298 if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) { 299 *status = U_INVALID_FORMAT_ERROR; 300 error((line == nullptr) ? 0 : *line, "invariant characters required for table keys, binary data, etc."); 301 return nullptr; 302 } 303 304 result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1)); 305 306 if (result == nullptr) 307 { 308 *status = U_MEMORY_ALLOCATION_ERROR; 309 return nullptr; 310 } 311 312 u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1); 313 stringLength = tokenValue->fLength; 314 return result; 315 } 316 317 static struct SResource * 318 parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status) 319 { 320 struct SResource *result = nullptr; 321 struct UString *tokenValue; 322 FileStream *file = nullptr; 323 CharString filename; 324 uint32_t line; 325 UBool quoted = false; 326 UCHARBUF *ucbuf=nullptr; 327 UChar32 c = 0; 328 const char* cp = nullptr; 329 char16_t *pTarget = nullptr; 330 char16_t *target = nullptr; 331 char16_t *targetLimit = nullptr; 332 int32_t size = 0; 333 334 expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); 335 336 if(isVerbose()){ 337 printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 338 } 339 340 if (U_FAILURE(*status)) 341 { 342 return nullptr; 343 } 344 /* make the filename including the directory */ 345 if (state->inputdir != nullptr) 346 { 347 filename.append(state->inputdir, -1, *status); 348 349 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) 350 { 351 filename.append(U_FILE_SEP_CHAR, *status); 352 } 353 } 354 355 filename.appendInvariantChars(tokenValue->fChars, tokenValue->fLength, *status); 356 357 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 358 359 if (U_FAILURE(*status)) 360 { 361 return nullptr; 362 } 363 364 if(state->omitCollationRules) { 365 return res_none(); 366 } 367 368 ucbuf = ucbuf_open(filename.data(), &cp, getShowWarning(),false, status); 369 370 if (U_FAILURE(*status)) { 371 error(line, "An error occurred while opening the input file %s\n", filename.data()); 372 return nullptr; 373 } 374 375 /* We allocate more space than actually required 376 * since the actual size needed for storing UChars 377 * is not known in UTF-8 byte stream 378 */ 379 size = ucbuf_size(ucbuf) + 1; 380 pTarget = static_cast<char16_t*>(uprv_malloc(U_SIZEOF_UCHAR * size)); 381 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR); 382 target = pTarget; 383 targetLimit = pTarget+size; 384 385 /* read the rules into the buffer */ 386 while (target < targetLimit) 387 { 388 c = ucbuf_getc(ucbuf, status); 389 if(c == QUOTE) { 390 quoted = static_cast<UBool>(!quoted); 391 } 392 /* weiv (06/26/2002): adding the following: 393 * - preserving spaces in commands [...] 394 * - # comments until the end of line 395 */ 396 if (c == STARTCOMMAND && !quoted) 397 { 398 /* preserve commands 399 * closing bracket will be handled by the 400 * append at the end of the loop 401 */ 402 while(c != ENDCOMMAND) { 403 U_APPEND_CHAR32_ONLY(c, target); 404 c = ucbuf_getc(ucbuf, status); 405 } 406 } 407 else if (c == HASH && !quoted) { 408 /* skip comments */ 409 while(c != CR && c != LF) { 410 c = ucbuf_getc(ucbuf, status); 411 } 412 continue; 413 } 414 else if (c == ESCAPE) 415 { 416 c = unescape(ucbuf, status); 417 418 if (c == static_cast<UChar32>(U_ERR)) 419 { 420 uprv_free(pTarget); 421 T_FileStream_close(file); 422 return nullptr; 423 } 424 } 425 else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF)) 426 { 427 /* ignore spaces carriage returns 428 * and line feed unless in the form \uXXXX 429 */ 430 continue; 431 } 432 433 /* Append char16_t * after dissembling if c > 0xffff*/ 434 if (c != static_cast<UChar32>(U_EOF)) 435 { 436 U_APPEND_CHAR32_ONLY(c, target); 437 } 438 else 439 { 440 break; 441 } 442 } 443 444 /* terminate the string */ 445 if(target < targetLimit){ 446 *target = 0x0000; 447 } 448 449 result = string_open(state->bundle, tag, pTarget, static_cast<int32_t>(target - pTarget), nullptr, status); 450 451 452 ucbuf_close(ucbuf); 453 uprv_free(pTarget); 454 T_FileStream_close(file); 455 456 return result; 457 } 458 459 static struct SResource * 460 parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status) 461 { 462 struct SResource *result = nullptr; 463 struct UString *tokenValue; 464 FileStream *file = nullptr; 465 char filename[256] = { '\0' }; 466 char cs[128] = { '\0' }; 467 uint32_t line; 468 UCHARBUF *ucbuf=nullptr; 469 const char* cp = nullptr; 470 char16_t *pTarget = nullptr; 471 const char16_t *pSource = nullptr; 472 int32_t size = 0; 473 474 expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); 475 476 if(isVerbose()){ 477 printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 478 } 479 480 if (U_FAILURE(*status)) 481 { 482 return nullptr; 483 } 484 /* make the filename including the directory */ 485 if (state->inputdir != nullptr) 486 { 487 uprv_strcat(filename, state->inputdir); 488 489 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) 490 { 491 uprv_strcat(filename, U_FILE_SEP_STRING); 492 } 493 } 494 495 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); 496 497 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 498 499 if (U_FAILURE(*status)) 500 { 501 return nullptr; 502 } 503 uprv_strcat(filename, cs); 504 505 506 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status); 507 508 if (U_FAILURE(*status)) { 509 error(line, "An error occurred while opening the input file %s\n", filename); 510 return nullptr; 511 } 512 513 /* We allocate more space than actually required 514 * since the actual size needed for storing UChars 515 * is not known in UTF-8 byte stream 516 */ 517 pSource = ucbuf_getBuffer(ucbuf, &size, status); 518 pTarget = static_cast<char16_t*>(uprv_malloc(U_SIZEOF_UCHAR * (size + 1))); 519 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR); 520 521 #if !UCONFIG_NO_TRANSLITERATION 522 size = utrans_stripRules(pSource, size, pTarget, status); 523 #else 524 size = 0; 525 fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n"); 526 #endif 527 result = string_open(state->bundle, tag, pTarget, size, nullptr, status); 528 529 ucbuf_close(ucbuf); 530 uprv_free(pTarget); 531 T_FileStream_close(file); 532 533 return result; 534 } 535 static ArrayResource* dependencyArray = nullptr; 536 537 static struct SResource * 538 parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) 539 { 540 struct SResource *result = nullptr; 541 struct SResource *elem = nullptr; 542 struct UString *tokenValue; 543 uint32_t line; 544 char filename[256] = { '\0' }; 545 char cs[128] = { '\0' }; 546 547 expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); 548 549 if(isVerbose()){ 550 printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 551 } 552 553 if (U_FAILURE(*status)) 554 { 555 return nullptr; 556 } 557 /* make the filename including the directory */ 558 if (state->outputdir != nullptr) 559 { 560 uprv_strcat(filename, state->outputdir); 561 562 if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR) 563 { 564 uprv_strcat(filename, U_FILE_SEP_STRING); 565 } 566 } 567 568 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); 569 570 if (U_FAILURE(*status)) 571 { 572 return nullptr; 573 } 574 uprv_strcat(filename, cs); 575 if(!T_FileStream_file_exists(filename)){ 576 if(isStrict()){ 577 error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename); 578 }else{ 579 warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename); 580 } 581 } 582 if(dependencyArray==nullptr){ 583 dependencyArray = array_open(state->bundle, "%%DEPENDENCY", nullptr, status); 584 } 585 if(tag!=nullptr){ 586 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); 587 } 588 elem = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, comment, status); 589 590 dependencyArray->add(elem); 591 592 if (U_FAILURE(*status)) 593 { 594 return nullptr; 595 } 596 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 597 return result; 598 } 599 static struct SResource * 600 parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) 601 { 602 struct UString *tokenValue; 603 struct SResource *result = nullptr; 604 605 /* if (tag != nullptr && uprv_strcmp(tag, "%%UCARULES") == 0) 606 { 607 return parseUCARules(tag, startline, status); 608 }*/ 609 if(isVerbose()){ 610 printf(" string %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 611 } 612 expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status); 613 614 if (U_SUCCESS(*status)) 615 { 616 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore 617 doesn't survive expect either) */ 618 619 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); 620 if(U_SUCCESS(*status) && result) { 621 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 622 623 if (U_FAILURE(*status)) 624 { 625 res_close(result); 626 return nullptr; 627 } 628 } 629 } 630 631 return result; 632 } 633 634 static struct SResource * 635 parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) 636 { 637 struct UString *tokenValue; 638 struct SResource *result = nullptr; 639 640 expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status); 641 642 if(isVerbose()){ 643 printf(" alias %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 644 } 645 646 if (U_SUCCESS(*status)) 647 { 648 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore 649 doesn't survive expect either) */ 650 651 result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); 652 653 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 654 655 if (U_FAILURE(*status)) 656 { 657 res_close(result); 658 return nullptr; 659 } 660 } 661 662 return result; 663 } 664 665 #if !UCONFIG_NO_COLLATION 666 667 namespace { 668 669 struct SResource* resLookup(struct SResource* res, const char* key) { 670 if (res == res_none() || !res->isTable()) { 671 return nullptr; 672 } 673 674 TableResource *list = static_cast<TableResource *>(res); 675 SResource *current = list->fFirst; 676 while (current != nullptr) { 677 if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) { 678 return current; 679 } 680 current = current->fNext; 681 } 682 return nullptr; 683 } 684 685 class GenrbImporter : public icu::CollationRuleParser::Importer { 686 public: 687 GenrbImporter(const char *in, const char *out) : inputDir(in), outputDir(out) {} 688 virtual ~GenrbImporter(); 689 virtual void getRules( 690 const char *localeID, const char *collationType, 691 UnicodeString &rules, 692 const char *&errorReason, UErrorCode &errorCode) override; 693 694 private: 695 const char *inputDir; 696 const char *outputDir; 697 }; 698 699 GenrbImporter::~GenrbImporter() {} 700 701 void 702 GenrbImporter::getRules( 703 const char *localeID, const char *collationType, 704 UnicodeString &rules, 705 const char *& /*errorReason*/, UErrorCode &errorCode) { 706 CharString filename(localeID, errorCode); 707 for(int32_t i = 0; i < filename.length(); i++){ 708 if(filename[i] == '-'){ 709 filename.data()[i] = '_'; 710 } 711 } 712 filename.append(".txt", errorCode); 713 if (U_FAILURE(errorCode)) { 714 return; 715 } 716 CharString inputDirBuf; 717 CharString openFileName; 718 if(inputDir == nullptr) { 719 const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR); 720 if (filenameBegin != nullptr) { 721 /* 722 * When a filename ../../../data/root.txt is specified, 723 * we presume that the input directory is ../../../data 724 * This is very important when the resource file includes 725 * another file, like UCARules.txt or thaidict.brk. 726 */ 727 StringPiece dir = filename.toStringPiece(); 728 const char *filenameLimit = filename.data() + filename.length(); 729 dir.remove_suffix(static_cast<int32_t>(filenameLimit - filenameBegin)); 730 inputDirBuf.append(dir, errorCode); 731 inputDir = inputDirBuf.data(); 732 } 733 }else{ 734 int32_t dirlen = static_cast<int32_t>(uprv_strlen(inputDir)); 735 736 if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) { 737 /* 738 * append the input dir to openFileName if the first char in 739 * filename is not file separator char and the last char input directory is not '.'. 740 * This is to support : 741 * genrb -s. /home/icu/data 742 * genrb -s. icu/data 743 * The user cannot mix notations like 744 * genrb -s. /icu/data --- the absolute path specified. -s redundant 745 * user should use 746 * genrb -s. icu/data --- start from CWD and look in icu/data dir 747 */ 748 openFileName.append(inputDir, dirlen, errorCode); 749 if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { 750 openFileName.append(U_FILE_SEP_CHAR, errorCode); 751 } 752 } 753 } 754 openFileName.append(filename, errorCode); 755 if(U_FAILURE(errorCode)) { 756 return; 757 } 758 // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data()); 759 const char* cp = ""; 760 LocalUCHARBUFPointer ucbuf( 761 ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode)); 762 if(errorCode == U_FILE_ACCESS_ERROR) { 763 fprintf(stderr, "couldn't open file %s\n", openFileName.data()); 764 return; 765 } 766 if (ucbuf.isNull() || U_FAILURE(errorCode)) { 767 fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode)); 768 return; 769 } 770 771 /* Parse the data into an SRBRoot */ 772 LocalPointer<SRBRoot> data( 773 parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode)); 774 if (U_FAILURE(errorCode)) { 775 return; 776 } 777 778 struct SResource *root = data->fRoot; 779 struct SResource *collations = resLookup(root, "collations"); 780 if (collations != nullptr) { 781 struct SResource *collation = resLookup(collations, collationType); 782 if (collation != nullptr) { 783 struct SResource *sequence = resLookup(collation, "Sequence"); 784 if (sequence != nullptr && sequence->isString()) { 785 // No string pointer aliasing so that we need not hold onto the resource bundle. 786 StringResource *sr = static_cast<StringResource *>(sequence); 787 rules = sr->fString; 788 } 789 } 790 } 791 } 792 793 // Quick-and-dirty escaping function. 794 // Assumes that we are on an ASCII-based platform. 795 void 796 escape(const char16_t *s, char *buffer, size_t n) { 797 int32_t length = u_strlen(s); 798 int32_t i = 0; 799 for (;;) { 800 UChar32 c; 801 U16_NEXT(s, i, length, c); 802 if (c == 0) { 803 *buffer = 0; 804 return; 805 } else if (0x20 <= c && c <= 0x7e) { 806 // printable ASCII 807 *buffer++ = static_cast<char>(c); // assumes ASCII-based platform 808 } else { 809 buffer += snprintf(buffer, n, "\\u%04X", static_cast<int>(c)); 810 } 811 } 812 } 813 814 } // namespace 815 816 static FILE* 817 openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) { 818 CharString baseName; 819 baseName.append(name, *status); 820 baseName.append("_", *status); 821 baseName.append(collationType, *status); 822 baseName.append("_", *status); 823 baseName.append(structType, *status); 824 825 CharString outFileName; 826 if (outputdir && *outputdir) { 827 outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status); 828 } 829 outFileName.append(baseName, *status); 830 outFileName.append(".toml", *status); 831 if (U_FAILURE(*status)) { 832 return nullptr; 833 } 834 835 FILE* f = fopen(outFileName.data(), "w"); 836 if (!f) { 837 *status = U_FILE_ACCESS_ERROR; 838 return nullptr; 839 } 840 usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X"); 841 842 return f; 843 } 844 845 static void 846 writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) { 847 FILE* f = openTOML(outputdir, name, collationType, "meta", status); 848 if (!f) { 849 return; 850 } 851 // printf("writeCollationMetadataTOML %s %s\n", name, collationType); 852 fprintf(f, "bits = 0x%X\n", metadataBits); 853 fclose(f); 854 } 855 856 static UChar32 857 writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { 858 UChar32 limit = ICU4X_DIACRITIC_LIMIT; 859 FILE* f = openTOML(outputdir, name, collationType, "dia", status); 860 if (!f) { 861 return limit; 862 } 863 // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType); 864 uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE]; 865 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { 866 uint16_t secondary = 0; 867 uint32_t ce32 = data->getCE32(c); 868 if (ce32 == icu::Collation::FALLBACK_CE32) { 869 ce32 = data->base->getCE32(c); 870 } 871 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { 872 // These never occur in NFD data 873 } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) { 874 if (uprv_strcmp(name, "root") == 0) { 875 printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c); 876 fclose(f); 877 *status = U_INTERNAL_PROGRAM_ERROR; 878 return limit; 879 } 880 limit = c; 881 break; 882 } else { 883 uint64_t ce = static_cast<uint64_t>(icu::Collation::ceFromCE32(ce32)); 884 if ((ce & 0xFFFFFFFF0000FFFF) != static_cast<uint64_t>(icu::Collation::COMMON_TERTIARY_CE)) { 885 // Not a CE where only the secondary weight differs from the expected 886 // pattern. 887 limit = c; 888 break; 889 } 890 secondary = static_cast<uint16_t>(ce >> 16); 891 } 892 secondaries[c - ICU4X_DIACRITIC_BASE] = secondary; 893 894 } 895 usrc_writeArray(f, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n"); 896 fclose(f); 897 return limit; 898 } 899 900 static void 901 writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) { 902 FILE* f = openTOML(outputdir, name, collationType, "reord", status); 903 if (!f) { 904 return; 905 } 906 // printf("writeCollationReorderingTOML %s %s\n", name, collationType); 907 fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder); 908 usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n"); 909 usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n"); 910 fclose(f); 911 } 912 913 914 static void 915 writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { 916 FILE* f = openTOML(outputdir, name, collationType, "jamo", status); 917 if (!f) { 918 printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType); 919 return; 920 } 921 uint32_t jamo[0x1200-0x1100]; 922 for (UChar32 c = 0x1100; c < 0x1200; ++c) { 923 uint32_t ce32 = data->getCE32(c); 924 if (ce32 == icu::Collation::FALLBACK_CE32) { 925 ce32 = data->base->getCE32(c); 926 } 927 // Can't reject complex CE32s, because search collations have expansions. 928 // These expansions refer to the tailoring, which foils the reuse of the 929 // these jamo tables. 930 // XXX Figure out what to do. Perhaps instead of having Latin mini expansions, 931 // there should be Hangul mini expansions. 932 // XXX in any case, validate that modern jamo are self-contained. 933 jamo[c - 0x1100] = ce32; 934 935 } 936 usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n"); 937 fclose(f); 938 } 939 940 static UBool 941 convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) { 942 if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) { 943 // Range entirely in conjoining jamo block. 944 return true; 945 } 946 icu::IcuToolErrorCode status("genrb: convertTrie"); 947 umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status); 948 return !U_FAILURE(*status); 949 } 950 951 static void 952 writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) { 953 FILE* f = openTOML(outputdir, name, collationType, "data", status); 954 if (!f) { 955 return; 956 } 957 // printf("writeCollationDataTOML %s %s\n", name, collationType); 958 959 icu::UnicodeSet tailoringSet; 960 961 if (data->base) { 962 tailoringSet.addAll(*(data->unsafeBackwardSet)); 963 tailoringSet.removeAll(*(data->base->unsafeBackwardSet)); 964 } else { 965 tailoringSet.addAll(*(data->unsafeBackwardSet)); 966 } 967 968 // Use the same value for out-of-range and default in the hope of not having to allocate 969 // different blocks, since ICU4X never does out-of-range queries. 970 uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32; 971 icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status)); 972 973 utrie2_enum(data->trie, nullptr, &convertTrie, builder.getAlias()); 974 975 // If the diacritic table was cut short, copy CE32s between the lowered 976 // limit and the max limit from the root to the tailoring. As of June 2022, 977 // no collation in CLDR needs this. 978 for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) { 979 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { 980 // These never occur in NFD data. 981 continue; 982 } 983 uint32_t ce32 = data->getCE32(c); 984 if (ce32 == icu::Collation::FALLBACK_CE32) { 985 ce32 = data->base->getCE32(c); 986 umutablecptrie_set(builder.getAlias(), c, ce32, status); 987 } 988 } 989 990 // Ensure that the range covered by the diacritic table isn't duplicated 991 // in the trie. 992 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) { 993 if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) { 994 umutablecptrie_set(builder.getAlias(), c, trieDefault, status); 995 } 996 } 997 998 icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( 999 builder.getAlias(), 1000 UCPTRIE_TYPE_SMALL, 1001 UCPTRIE_VALUE_BITS_32, 1002 status)); 1003 usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n"); 1004 usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n"); 1005 usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n"); 1006 fprintf(f, "[trie]\n"); 1007 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); 1008 1009 fclose(f); 1010 } 1011 1012 static void 1013 writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { 1014 FILE* f = openTOML(outputdir, name, collationType, "prim", status); 1015 if (!f) { 1016 return; 1017 } 1018 // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType); 1019 1020 uint16_t lastPrimaries[4]; 1021 for (int32_t i = 0; i < 4; ++i) { 1022 // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one 1023 // back to get a value that fits in 16 bits. 1024 lastPrimaries[i] = static_cast<uint16_t>((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16); 1025 } 1026 1027 uint32_t numericPrimary = data->numericPrimary; 1028 if (numericPrimary & 0xFFFFFF) { 1029 printf("Lower 24 bits set in numeric primary"); 1030 *status = U_INTERNAL_PROGRAM_ERROR; 1031 return; 1032 } 1033 1034 usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n"); 1035 usrc_writeArray(f, "compressible_bytes = [\n ", data->compressibleBytes, 1, 256, " ", "\n]\n"); 1036 fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24); 1037 fclose(f); 1038 } 1039 1040 static void 1041 writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) { 1042 UBool tailored = false; 1043 UBool tailoredDiacritics = false; 1044 UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0); 1045 UBool reordering = false; 1046 UBool isRoot = uprv_strcmp(name, "root") == 0; 1047 UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT; 1048 if (!data->base && isRoot) { 1049 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); 1050 if (U_FAILURE(*status)) { 1051 return; 1052 } 1053 writeCollationJamoTOML(outputdir, name, collationType, data, status); 1054 if (U_FAILURE(*status)) { 1055 return; 1056 } 1057 writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status); 1058 if (U_FAILURE(*status)) { 1059 return; 1060 } 1061 } else if (data->base && !lithuanianDotAbove) { 1062 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { 1063 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { 1064 // These never occur in NFD data. 1065 continue; 1066 } 1067 uint32_t ce32 = data->getCE32(c); 1068 if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) { 1069 tailoredDiacritics = true; 1070 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); 1071 if (U_FAILURE(*status)) { 1072 return; 1073 } 1074 break; 1075 } 1076 } 1077 } 1078 1079 if (settings->hasReordering()) { 1080 reordering = true; 1081 // Note: There are duplicate reorderings. Expecting the ICU4X provider 1082 // to take care of deduplication. 1083 writeCollationReorderingTOML(outputdir, name, collationType, settings, status); 1084 if (U_FAILURE(*status)) { 1085 return; 1086 } 1087 } 1088 1089 // Write collation data if either base is non-null or the name is root. 1090 // Languages that only reorder scripts are otherwise root-like and have 1091 // null base. 1092 if (data->base || isRoot) { 1093 tailored = !isRoot; 1094 writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status); 1095 if (U_FAILURE(*status)) { 1096 return; 1097 } 1098 } 1099 1100 uint32_t maxVariable = static_cast<uint32_t>(settings->getMaxVariable()); 1101 if (maxVariable >= 4) { 1102 printf("Max variable out of range"); 1103 *status = U_INTERNAL_PROGRAM_ERROR; 1104 return; 1105 } 1106 1107 uint32_t metadataBits = maxVariable; 1108 if (tailored) { 1109 metadataBits |= (1 << 3); 1110 } 1111 if (tailoredDiacritics) { 1112 metadataBits |= (1 << 4); 1113 } 1114 if (reordering) { 1115 metadataBits |= (1 << 5); 1116 } 1117 if (lithuanianDotAbove) { 1118 metadataBits |= (1 << 6); 1119 } 1120 if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) { 1121 metadataBits |= (1 << 7); 1122 } 1123 if (settings->getAlternateHandling() == UCOL_SHIFTED) { 1124 metadataBits |= (1 << 8); 1125 } 1126 switch (settings->getCaseFirst()) { 1127 case UCOL_OFF: 1128 break; 1129 case UCOL_UPPER_FIRST: 1130 metadataBits |= (1 << 9); 1131 metadataBits |= (1 << 10); 1132 break; 1133 case UCOL_LOWER_FIRST: 1134 metadataBits |= (1 << 9); 1135 break; 1136 default: 1137 *status = U_INTERNAL_PROGRAM_ERROR; 1138 return; 1139 } 1140 1141 writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status); 1142 } 1143 1144 #endif // !UCONFIG_NO_COLLATION 1145 1146 static TableResource * 1147 addCollation(ParseState* state, TableResource *result, const char *collationType, 1148 uint32_t startline, UErrorCode *status) 1149 { 1150 // TODO: Use LocalPointer for result, or make caller close it when there is a failure. 1151 struct SResource *member = nullptr; 1152 struct UString *tokenValue; 1153 struct UString comment; 1154 enum ETokenType token; 1155 CharString subtag; 1156 UnicodeString rules; 1157 UBool haveRules = false; 1158 UVersionInfo version; 1159 uint32_t line; 1160 1161 /* '{' . (name resource)* '}' */ 1162 version[0]=0; version[1]=0; version[2]=0; version[3]=0; 1163 1164 for (;;) 1165 { 1166 ustr_init(&comment); 1167 token = getToken(state, &tokenValue, &comment, &line, status); 1168 1169 if (token == TOK_CLOSE_BRACE) 1170 { 1171 break; 1172 } 1173 1174 if (token != TOK_STRING) 1175 { 1176 res_close(result); 1177 *status = U_INVALID_FORMAT_ERROR; 1178 1179 if (token == TOK_EOF) 1180 { 1181 error(startline, "unterminated table"); 1182 } 1183 else 1184 { 1185 error(line, "Unexpected token %s", tokenNames[token]); 1186 } 1187 1188 return nullptr; 1189 } 1190 1191 subtag.clear(); 1192 subtag.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status); 1193 if (U_FAILURE(*status)) 1194 { 1195 res_close(result); 1196 return nullptr; 1197 } 1198 1199 member = parseResource(state, subtag.data(), nullptr, status); 1200 1201 if (U_FAILURE(*status)) 1202 { 1203 res_close(result); 1204 return nullptr; 1205 } 1206 if (result == nullptr) 1207 { 1208 // Ignore the parsed resources, continue parsing. 1209 } 1210 else if (uprv_strcmp(subtag.data(), "Version") == 0 && member->isString()) 1211 { 1212 StringResource *sr = static_cast<StringResource *>(member); 1213 char ver[40]; 1214 int32_t length = sr->length(); 1215 1216 if (length >= UPRV_LENGTHOF(ver)) 1217 { 1218 length = UPRV_LENGTHOF(ver) - 1; 1219 } 1220 1221 sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV); 1222 u_versionFromString(version, ver); 1223 1224 result->add(member, line, *status); 1225 member = nullptr; 1226 } 1227 else if(uprv_strcmp(subtag.data(), "%%CollationBin")==0) 1228 { 1229 /* discard duplicate %%CollationBin if any*/ 1230 } 1231 else if (uprv_strcmp(subtag.data(), "Sequence") == 0 && member->isString()) 1232 { 1233 StringResource *sr = static_cast<StringResource *>(member); 1234 rules = sr->fString; 1235 haveRules = true; 1236 // Defer building the collator until we have seen 1237 // all sub-elements of the collation table, including the Version. 1238 /* in order to achieve smaller data files, we can direct genrb */ 1239 /* to omit collation rules */ 1240 if(!state->omitCollationRules) { 1241 result->add(member, line, *status); 1242 member = nullptr; 1243 } 1244 } 1245 else // Just copy non-special items. 1246 { 1247 result->add(member, line, *status); 1248 member = nullptr; 1249 } 1250 res_close(member); // TODO: use LocalPointer 1251 if (U_FAILURE(*status)) 1252 { 1253 res_close(result); 1254 return nullptr; 1255 } 1256 } 1257 1258 if (!haveRules) { return result; } 1259 1260 #if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO 1261 warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h"); 1262 (void)collationType; 1263 #else 1264 // CLDR ticket #3949, ICU ticket #8082: 1265 // Do not build collation binary data for for-import-only "private" collation rule strings. 1266 if (uprv_strncmp(collationType, "private-", 8) == 0) { 1267 if(isVerbose()) { 1268 printf("Not building %s~%s collation binary\n", state->filename, collationType); 1269 } 1270 return result; 1271 } 1272 1273 if(!state->makeBinaryCollation) { 1274 if(isVerbose()) { 1275 printf("Not building %s~%s collation binary\n", state->filename, collationType); 1276 } 1277 return result; 1278 } 1279 UErrorCode intStatus = U_ZERO_ERROR; 1280 UParseError parseError; 1281 uprv_memset(&parseError, 0, sizeof(parseError)); 1282 GenrbImporter importer(state->inputdir, state->outputdir); 1283 const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus); 1284 if(U_FAILURE(intStatus)) { 1285 error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus)); 1286 res_close(result); 1287 return nullptr; // TODO: use LocalUResourceBundlePointer for result 1288 } 1289 icu::CollationBuilder builder(base, state->icu4xMode, intStatus); 1290 if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) { 1291 builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X 1292 } 1293 LocalPointer<icu::CollationTailoring> t( 1294 builder.parseAndBuild(rules, version, &importer, &parseError, intStatus)); 1295 if(U_FAILURE(intStatus)) { 1296 const char *reason = builder.getErrorReason(); 1297 if(reason == nullptr) { reason = ""; } 1298 error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s %s", 1299 state->filename, collationType, 1300 static_cast<long>(parseError.offset), u_errorName(intStatus), reason); 1301 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { 1302 // Print pre- and post-context. 1303 char preBuffer[100], postBuffer[100]; 1304 escape(parseError.preContext, preBuffer, sizeof(preBuffer)); 1305 escape(parseError.postContext, postBuffer, sizeof(postBuffer)); 1306 error(line, " error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer); 1307 } 1308 if(isStrict() || t.isNull()) { 1309 *status = intStatus; 1310 res_close(result); 1311 return nullptr; 1312 } 1313 } 1314 if (state->icu4xMode) { 1315 char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1)); 1316 if (nameWithoutSuffix == nullptr) { 1317 *status = U_MEMORY_ALLOCATION_ERROR; 1318 res_close(result); 1319 return nullptr; 1320 } 1321 uprv_strcpy(nameWithoutSuffix, state->filename); 1322 *uprv_strrchr(nameWithoutSuffix, '.') = 0; 1323 1324 writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status); 1325 uprv_free(nameWithoutSuffix); 1326 } 1327 icu::LocalMemory<uint8_t> buffer; 1328 int32_t capacity = 100000; 1329 uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); 1330 if(dest == nullptr) { 1331 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", 1332 static_cast<long>(capacity)); 1333 *status = U_MEMORY_ALLOCATION_ERROR; 1334 res_close(result); 1335 return nullptr; 1336 } 1337 int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1]; 1338 int32_t totalSize = icu::CollationDataWriter::writeTailoring( 1339 *t, *t->settings, indexes, dest, capacity, intStatus); 1340 if(intStatus == U_BUFFER_OVERFLOW_ERROR) { 1341 intStatus = U_ZERO_ERROR; 1342 capacity = totalSize; 1343 dest = buffer.allocateInsteadAndCopy(capacity); 1344 if(dest == nullptr) { 1345 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", 1346 static_cast<long>(capacity)); 1347 *status = U_MEMORY_ALLOCATION_ERROR; 1348 res_close(result); 1349 return nullptr; 1350 } 1351 totalSize = icu::CollationDataWriter::writeTailoring( 1352 *t, *t->settings, indexes, dest, capacity, intStatus); 1353 } 1354 if(U_FAILURE(intStatus)) { 1355 fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n", 1356 u_errorName(intStatus)); 1357 res_close(result); 1358 return nullptr; 1359 } 1360 if(isVerbose()) { 1361 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType); 1362 icu::CollationInfo::printSizes(totalSize, indexes); 1363 if(t->settings->hasReordering()) { 1364 printf("%s~%s collation reordering ranges:\n", state->filename, collationType); 1365 icu::CollationInfo::printReorderRanges( 1366 *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength); 1367 } 1368 #if 0 // debugging output 1369 } else { 1370 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType); 1371 icu::CollationInfo::printSizes(totalSize, indexes); 1372 #endif 1373 } 1374 struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, nullptr, nullptr, status); 1375 result->add(collationBin, line, *status); 1376 if (U_FAILURE(*status)) { 1377 res_close(result); 1378 return nullptr; 1379 } 1380 #endif 1381 return result; 1382 } 1383 1384 static UBool 1385 keepCollationType(const char * /*type*/) { 1386 return true; 1387 } 1388 1389 static struct SResource * 1390 parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status) 1391 { 1392 TableResource *result = nullptr; 1393 struct SResource *member = nullptr; 1394 struct UString *tokenValue; 1395 struct UString comment; 1396 enum ETokenType token; 1397 CharString subtag, typeKeyword; 1398 uint32_t line; 1399 1400 result = table_open(state->bundle, tag, nullptr, status); 1401 1402 if (result == nullptr || U_FAILURE(*status)) 1403 { 1404 return nullptr; 1405 } 1406 if(isVerbose()){ 1407 printf(" collation elements %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1408 } 1409 if(!newCollation) { 1410 return addCollation(state, result, "(no type)", startline, status); 1411 } 1412 else { 1413 for(;;) { 1414 ustr_init(&comment); 1415 token = getToken(state, &tokenValue, &comment, &line, status); 1416 1417 if (token == TOK_CLOSE_BRACE) 1418 { 1419 return result; 1420 } 1421 1422 if (token != TOK_STRING) 1423 { 1424 res_close(result); 1425 *status = U_INVALID_FORMAT_ERROR; 1426 1427 if (token == TOK_EOF) 1428 { 1429 error(startline, "unterminated table"); 1430 } 1431 else 1432 { 1433 error(line, "Unexpected token %s", tokenNames[token]); 1434 } 1435 1436 return nullptr; 1437 } 1438 1439 subtag.clear(); 1440 subtag.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status); 1441 1442 if (U_FAILURE(*status)) 1443 { 1444 res_close(result); 1445 return nullptr; 1446 } 1447 1448 if (uprv_strcmp(subtag.data(), "default") == 0) 1449 { 1450 member = parseResource(state, subtag.data(), nullptr, status); 1451 1452 if (U_FAILURE(*status)) 1453 { 1454 res_close(result); 1455 return nullptr; 1456 } 1457 1458 result->add(member, line, *status); 1459 } 1460 else 1461 { 1462 token = peekToken(state, 0, &tokenValue, &line, &comment, status); 1463 /* this probably needs to be refactored or recursively use the parser */ 1464 /* first we assume that our collation table won't have the explicit type */ 1465 /* then, we cannot handle aliases */ 1466 if(token == TOK_OPEN_BRACE) { 1467 token = getToken(state, &tokenValue, &comment, &line, status); 1468 TableResource *collationRes; 1469 if (keepCollationType(subtag.data())) { 1470 collationRes = table_open(state->bundle, subtag.data(), nullptr, status); 1471 } else { 1472 collationRes = nullptr; 1473 } 1474 // need to parse the collation data regardless 1475 collationRes = addCollation(state, collationRes, subtag.data(), startline, status); 1476 if (collationRes != nullptr) { 1477 result->add(collationRes, startline, *status); 1478 } 1479 } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */ 1480 /* we could have a table too */ 1481 token = peekToken(state, 1, &tokenValue, &line, &comment, status); 1482 typeKeyword.clear(); 1483 typeKeyword.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status); 1484 if (U_FAILURE(*status)) 1485 { 1486 res_close(result); 1487 return nullptr; 1488 } 1489 1490 if(uprv_strcmp(typeKeyword.data(), "alias") == 0) { 1491 member = parseResource(state, subtag.data(), nullptr, status); 1492 if (U_FAILURE(*status)) 1493 { 1494 res_close(result); 1495 return nullptr; 1496 } 1497 1498 result->add(member, line, *status); 1499 } else { 1500 res_close(result); 1501 *status = U_INVALID_FORMAT_ERROR; 1502 return nullptr; 1503 } 1504 } else { 1505 res_close(result); 1506 *status = U_INVALID_FORMAT_ERROR; 1507 return nullptr; 1508 } 1509 } 1510 1511 /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/ 1512 1513 /*expect(TOK_CLOSE_BRACE, nullptr, nullptr, status);*/ 1514 1515 if (U_FAILURE(*status)) 1516 { 1517 res_close(result); 1518 return nullptr; 1519 } 1520 } 1521 } 1522 } 1523 1524 /* Necessary, because CollationElements requires the bundle->fRoot member to be present which, 1525 if this weren't special-cased, wouldn't be set until the entire file had been processed. */ 1526 static struct SResource * 1527 realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status) 1528 { 1529 struct SResource *member = nullptr; 1530 struct UString *tokenValue=nullptr; 1531 struct UString comment; 1532 enum ETokenType token; 1533 CharString subtag; 1534 uint32_t line; 1535 UBool readToken = false; 1536 1537 /* '{' . (name resource)* '}' */ 1538 1539 if(isVerbose()){ 1540 printf(" parsing table %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1541 } 1542 for (;;) 1543 { 1544 ustr_init(&comment); 1545 token = getToken(state, &tokenValue, &comment, &line, status); 1546 1547 if (token == TOK_CLOSE_BRACE) 1548 { 1549 if (!readToken && isVerbose()) { 1550 warning(startline, "Encountered empty table"); 1551 } 1552 return table; 1553 } 1554 1555 if (token != TOK_STRING) 1556 { 1557 *status = U_INVALID_FORMAT_ERROR; 1558 1559 if (token == TOK_EOF) 1560 { 1561 error(startline, "unterminated table"); 1562 } 1563 else 1564 { 1565 error(line, "unexpected token %s", tokenNames[token]); 1566 } 1567 1568 return nullptr; 1569 } 1570 1571 if(uprv_isInvariantUString(tokenValue->fChars, -1)) { 1572 subtag.clear(); 1573 subtag.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status); 1574 } else { 1575 *status = U_INVALID_FORMAT_ERROR; 1576 error(line, "invariant characters required for table keys"); 1577 return nullptr; 1578 } 1579 1580 if (U_FAILURE(*status)) 1581 { 1582 error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status)); 1583 return nullptr; 1584 } 1585 1586 member = parseResource(state, subtag.data(), &comment, status); 1587 1588 if (member == nullptr || U_FAILURE(*status)) 1589 { 1590 error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status)); 1591 return nullptr; 1592 } 1593 1594 table->add(member, line, *status); 1595 1596 if (U_FAILURE(*status)) 1597 { 1598 error(line, "parse error. Stopped parsing table with %s", u_errorName(*status)); 1599 return nullptr; 1600 } 1601 readToken = true; 1602 ustr_deinit(&comment); 1603 } 1604 1605 /* not reached */ 1606 /* A compiler warning will appear if all paths don't contain a return statement. */ 1607 /* *status = U_INTERNAL_PROGRAM_ERROR; 1608 return nullptr;*/ 1609 } 1610 1611 static struct SResource * 1612 parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) 1613 { 1614 if (tag != nullptr && uprv_strcmp(tag, "CollationElements") == 0) 1615 { 1616 return parseCollationElements(state, tag, startline, false, status); 1617 } 1618 if (tag != nullptr && uprv_strcmp(tag, "collations") == 0) 1619 { 1620 return parseCollationElements(state, tag, startline, true, status); 1621 } 1622 if(isVerbose()){ 1623 printf(" table %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1624 } 1625 1626 TableResource *result = table_open(state->bundle, tag, comment, status); 1627 1628 if (result == nullptr || U_FAILURE(*status)) 1629 { 1630 return nullptr; 1631 } 1632 return realParseTable(state, result, tag, startline, status); 1633 } 1634 1635 static struct SResource * 1636 parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) 1637 { 1638 struct SResource *member = nullptr; 1639 struct UString *tokenValue; 1640 struct UString memberComments; 1641 enum ETokenType token; 1642 UBool readToken = false; 1643 1644 ArrayResource *result = array_open(state->bundle, tag, comment, status); 1645 1646 if (result == nullptr || U_FAILURE(*status)) 1647 { 1648 return nullptr; 1649 } 1650 if(isVerbose()){ 1651 printf(" array %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1652 } 1653 1654 ustr_init(&memberComments); 1655 1656 /* '{' . resource [','] '}' */ 1657 for (;;) 1658 { 1659 /* reset length */ 1660 ustr_setlen(&memberComments, 0, status); 1661 1662 /* check for end of array, but don't consume next token unless it really is the end */ 1663 token = peekToken(state, 0, &tokenValue, nullptr, &memberComments, status); 1664 1665 1666 if (token == TOK_CLOSE_BRACE) 1667 { 1668 getToken(state, nullptr, nullptr, nullptr, status); 1669 if (!readToken) { 1670 warning(startline, "Encountered empty array"); 1671 } 1672 break; 1673 } 1674 1675 if (token == TOK_EOF) 1676 { 1677 res_close(result); 1678 *status = U_INVALID_FORMAT_ERROR; 1679 error(startline, "unterminated array"); 1680 return nullptr; 1681 } 1682 1683 /* string arrays are a special case */ 1684 if (token == TOK_STRING) 1685 { 1686 getToken(state, &tokenValue, &memberComments, nullptr, status); 1687 member = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, &memberComments, status); 1688 } 1689 else 1690 { 1691 member = parseResource(state, nullptr, &memberComments, status); 1692 } 1693 1694 if (member == nullptr || U_FAILURE(*status)) 1695 { 1696 res_close(result); 1697 return nullptr; 1698 } 1699 1700 result->add(member); 1701 1702 /* eat optional comma if present */ 1703 token = peekToken(state, 0, nullptr, nullptr, nullptr, status); 1704 1705 if (token == TOK_COMMA) 1706 { 1707 getToken(state, nullptr, nullptr, nullptr, status); 1708 } 1709 1710 if (U_FAILURE(*status)) 1711 { 1712 res_close(result); 1713 return nullptr; 1714 } 1715 readToken = true; 1716 } 1717 1718 ustr_deinit(&memberComments); 1719 return result; 1720 } 1721 1722 static struct SResource * 1723 parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) 1724 { 1725 enum ETokenType token; 1726 char *string; 1727 int32_t value; 1728 UBool readToken = false; 1729 char *stopstring; 1730 struct UString memberComments; 1731 1732 IntVectorResource *result = intvector_open(state->bundle, tag, comment, status); 1733 1734 if (result == nullptr || U_FAILURE(*status)) 1735 { 1736 return nullptr; 1737 } 1738 1739 if(isVerbose()){ 1740 printf(" vector %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1741 } 1742 ustr_init(&memberComments); 1743 /* '{' . string [','] '}' */ 1744 for (;;) 1745 { 1746 ustr_setlen(&memberComments, 0, status); 1747 1748 /* check for end of array, but don't consume next token unless it really is the end */ 1749 token = peekToken(state, 0, nullptr, nullptr,&memberComments, status); 1750 1751 if (token == TOK_CLOSE_BRACE) 1752 { 1753 /* it's the end, consume the close brace */ 1754 getToken(state, nullptr, nullptr, nullptr, status); 1755 if (!readToken) { 1756 warning(startline, "Encountered empty int vector"); 1757 } 1758 ustr_deinit(&memberComments); 1759 return result; 1760 } 1761 1762 int32_t stringLength; 1763 string = getInvariantString(state, nullptr, nullptr, stringLength, status); 1764 1765 if (U_FAILURE(*status)) 1766 { 1767 res_close(result); 1768 return nullptr; 1769 } 1770 1771 /* For handling illegal char in the Intvector */ 1772 value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/ 1773 int32_t len = static_cast<int32_t>(stopstring - string); 1774 1775 if(len==stringLength) 1776 { 1777 result->add(value, *status); 1778 uprv_free(string); 1779 token = peekToken(state, 0, nullptr, nullptr, nullptr, status); 1780 } 1781 else 1782 { 1783 uprv_free(string); 1784 *status=U_INVALID_CHAR_FOUND; 1785 } 1786 1787 if (U_FAILURE(*status)) 1788 { 1789 res_close(result); 1790 return nullptr; 1791 } 1792 1793 /* the comma is optional (even though it is required to prevent the reader from concatenating 1794 consecutive entries) so that a missing comma on the last entry isn't an error */ 1795 if (token == TOK_COMMA) 1796 { 1797 getToken(state, nullptr, nullptr, nullptr, status); 1798 } 1799 readToken = true; 1800 } 1801 1802 /* not reached */ 1803 /* A compiler warning will appear if all paths don't contain a return statement. */ 1804 /* intvector_close(result, status); 1805 *status = U_INTERNAL_PROGRAM_ERROR; 1806 return nullptr;*/ 1807 } 1808 1809 static struct SResource * 1810 parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) 1811 { 1812 uint32_t line; 1813 int32_t stringLength; 1814 LocalMemory<char> string(getInvariantString(state, &line, nullptr, stringLength, status)); 1815 if (string.isNull() || U_FAILURE(*status)) 1816 { 1817 return nullptr; 1818 } 1819 1820 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 1821 if (U_FAILURE(*status)) 1822 { 1823 return nullptr; 1824 } 1825 1826 if(isVerbose()){ 1827 printf(" binary %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1828 } 1829 1830 LocalMemory<uint8_t> value; 1831 int32_t count = 0; 1832 if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == nullptr) 1833 { 1834 *status = U_MEMORY_ALLOCATION_ERROR; 1835 return nullptr; 1836 } 1837 1838 char toConv[3] = {'\0', '\0', '\0'}; 1839 for (int32_t i = 0; i < stringLength;) 1840 { 1841 // Skip spaces (which may have been line endings). 1842 char c0 = string[i++]; 1843 if (c0 == ' ') { continue; } 1844 if (i == stringLength) { 1845 *status=U_INVALID_CHAR_FOUND; 1846 error(line, "Encountered invalid binary value (odd number of hex digits)"); 1847 return nullptr; 1848 } 1849 toConv[0] = c0; 1850 toConv[1] = string[i++]; 1851 1852 char *stopstring; 1853 value[count++] = static_cast<uint8_t>(uprv_strtoul(toConv, &stopstring, 16)); 1854 uint32_t len = static_cast<uint32_t>(stopstring - toConv); 1855 1856 if(len!=2) 1857 { 1858 *status=U_INVALID_CHAR_FOUND; 1859 error(line, "Encountered invalid binary value (not all pairs of hex digits)"); 1860 return nullptr; 1861 } 1862 } 1863 1864 if (count == 0) { 1865 warning(startline, "Encountered empty binary value"); 1866 return bin_open(state->bundle, tag, 0, nullptr, "", comment, status); 1867 } else { 1868 return bin_open(state->bundle, tag, count, value.getAlias(), nullptr, comment, status); 1869 } 1870 } 1871 1872 static struct SResource * 1873 parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) 1874 { 1875 struct SResource *result = nullptr; 1876 int32_t value; 1877 char *string; 1878 char *stopstring; 1879 1880 int32_t stringLength; 1881 string = getInvariantString(state, nullptr, nullptr, stringLength, status); 1882 1883 if (string == nullptr || U_FAILURE(*status)) 1884 { 1885 return nullptr; 1886 } 1887 1888 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 1889 1890 if (U_FAILURE(*status)) 1891 { 1892 uprv_free(string); 1893 return nullptr; 1894 } 1895 1896 if(isVerbose()){ 1897 printf(" integer %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1898 } 1899 1900 if (stringLength == 0) 1901 { 1902 warning(startline, "Encountered empty integer. Default value is 0."); 1903 } 1904 1905 /* Allow integer support for hexdecimal, octal digit and decimal*/ 1906 /* and handle illegal char in the integer*/ 1907 value = uprv_strtoul(string, &stopstring, 0); 1908 int32_t len = static_cast<int32_t>(stopstring - string); 1909 if(len==stringLength) 1910 { 1911 result = int_open(state->bundle, tag, value, comment, status); 1912 } 1913 else 1914 { 1915 *status=U_INVALID_CHAR_FOUND; 1916 } 1917 uprv_free(string); 1918 1919 return result; 1920 } 1921 1922 static struct SResource * 1923 parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) 1924 { 1925 uint32_t line; 1926 int32_t stringLength; 1927 LocalMemory<char> filename(getInvariantString(state, &line, nullptr, stringLength, status)); 1928 if (U_FAILURE(*status)) 1929 { 1930 return nullptr; 1931 } 1932 1933 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 1934 1935 if (U_FAILURE(*status)) 1936 { 1937 return nullptr; 1938 } 1939 1940 if(isVerbose()){ 1941 printf(" import %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 1942 } 1943 1944 /* Open the input file for reading */ 1945 CharString fullname; 1946 if (state->inputdir != nullptr) { 1947 fullname.append(state->inputdir, *status); 1948 } 1949 fullname.appendPathPart(filename.getAlias(), *status); 1950 if (U_FAILURE(*status)) { 1951 return nullptr; 1952 } 1953 1954 FileStream *file = T_FileStream_open(fullname.data(), "rb"); 1955 if (file == nullptr) 1956 { 1957 error(line, "couldn't open input file %s", filename.getAlias()); 1958 *status = U_FILE_ACCESS_ERROR; 1959 return nullptr; 1960 } 1961 1962 int32_t len = T_FileStream_size(file); 1963 LocalMemory<uint8_t> data; 1964 if(data.allocateInsteadAndCopy(len) == nullptr) 1965 { 1966 *status = U_MEMORY_ALLOCATION_ERROR; 1967 T_FileStream_close (file); 1968 return nullptr; 1969 } 1970 1971 /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len); 1972 T_FileStream_close (file); 1973 1974 return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status); 1975 } 1976 1977 static struct SResource * 1978 parseInclude(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) 1979 { 1980 struct SResource *result; 1981 int32_t len=0; 1982 char *filename; 1983 uint32_t line; 1984 char16_t *pTarget = nullptr; 1985 1986 UCHARBUF *ucbuf; 1987 char *fullname = nullptr; 1988 const char* cp = nullptr; 1989 const char16_t* uBuffer = nullptr; 1990 1991 int32_t stringLength; 1992 filename = getInvariantString(state, &line, nullptr, stringLength, status); 1993 1994 if (U_FAILURE(*status)) 1995 { 1996 return nullptr; 1997 } 1998 1999 expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); 2000 2001 if (U_FAILURE(*status)) 2002 { 2003 uprv_free(filename); 2004 return nullptr; 2005 } 2006 2007 if(isVerbose()){ 2008 printf(" include %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 2009 } 2010 2011 fullname = static_cast<char*>(uprv_malloc(state->inputdirLength + stringLength + 2)); 2012 /* test for nullptr */ 2013 if(fullname == nullptr) 2014 { 2015 *status = U_MEMORY_ALLOCATION_ERROR; 2016 uprv_free(filename); 2017 return nullptr; 2018 } 2019 2020 if(state->inputdir!=nullptr){ 2021 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) 2022 { 2023 2024 uprv_strcpy(fullname, state->inputdir); 2025 2026 fullname[state->inputdirLength] = U_FILE_SEP_CHAR; 2027 fullname[state->inputdirLength + 1] = '\0'; 2028 2029 uprv_strcat(fullname, filename); 2030 } 2031 else 2032 { 2033 uprv_strcpy(fullname, state->inputdir); 2034 uprv_strcat(fullname, filename); 2035 } 2036 }else{ 2037 uprv_strcpy(fullname,filename); 2038 } 2039 2040 ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status); 2041 2042 if (U_FAILURE(*status)) { 2043 error(line, "couldn't open input file %s\n", filename); 2044 return nullptr; 2045 } 2046 2047 uBuffer = ucbuf_getBuffer(ucbuf,&len,status); 2048 result = string_open(state->bundle, tag, uBuffer, len, comment, status); 2049 2050 ucbuf_close(ucbuf); 2051 2052 uprv_free(pTarget); 2053 2054 uprv_free(filename); 2055 uprv_free(fullname); 2056 2057 return result; 2058 } 2059 2060 2061 2062 2063 2064 U_STRING_DECL(k_type_string, "string", 6); 2065 U_STRING_DECL(k_type_binary, "binary", 6); 2066 U_STRING_DECL(k_type_bin, "bin", 3); 2067 U_STRING_DECL(k_type_table, "table", 5); 2068 U_STRING_DECL(k_type_table_no_fallback, "table(nofallback)", 17); 2069 U_STRING_DECL(k_type_int, "int", 3); 2070 U_STRING_DECL(k_type_integer, "integer", 7); 2071 U_STRING_DECL(k_type_array, "array", 5); 2072 U_STRING_DECL(k_type_alias, "alias", 5); 2073 U_STRING_DECL(k_type_intvector, "intvector", 9); 2074 U_STRING_DECL(k_type_import, "import", 6); 2075 U_STRING_DECL(k_type_include, "include", 7); 2076 2077 /* Various non-standard processing plugins that create one or more special resources. */ 2078 U_STRING_DECL(k_type_plugin_uca_rules, "process(uca_rules)", 18); 2079 U_STRING_DECL(k_type_plugin_collation, "process(collation)", 18); 2080 U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)", 23); 2081 U_STRING_DECL(k_type_plugin_dependency, "process(dependency)", 19); 2082 2083 typedef enum EResourceType 2084 { 2085 RESTYPE_UNKNOWN, 2086 RESTYPE_STRING, 2087 RESTYPE_BINARY, 2088 RESTYPE_TABLE, 2089 RESTYPE_TABLE_NO_FALLBACK, 2090 RESTYPE_INTEGER, 2091 RESTYPE_ARRAY, 2092 RESTYPE_ALIAS, 2093 RESTYPE_INTVECTOR, 2094 RESTYPE_IMPORT, 2095 RESTYPE_INCLUDE, 2096 RESTYPE_PROCESS_UCA_RULES, 2097 RESTYPE_PROCESS_COLLATION, 2098 RESTYPE_PROCESS_TRANSLITERATOR, 2099 RESTYPE_PROCESS_DEPENDENCY, 2100 RESTYPE_RESERVED 2101 } EResourceType; 2102 2103 static struct { 2104 const char *nameChars; /* only used for debugging */ 2105 const char16_t *nameUChars; 2106 ParseResourceFunction *parseFunction; 2107 } gResourceTypes[] = { 2108 {"Unknown", nullptr, nullptr}, 2109 {"string", k_type_string, parseString}, 2110 {"binary", k_type_binary, parseBinary}, 2111 {"table", k_type_table, parseTable}, 2112 {"table(nofallback)", k_type_table_no_fallback, nullptr}, /* parseFunction will never be called */ 2113 {"integer", k_type_integer, parseInteger}, 2114 {"array", k_type_array, parseArray}, 2115 {"alias", k_type_alias, parseAlias}, 2116 {"intvector", k_type_intvector, parseIntVector}, 2117 {"import", k_type_import, parseImport}, 2118 {"include", k_type_include, parseInclude}, 2119 {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules}, 2120 {"process(collation)", k_type_plugin_collation, nullptr /* not implemented yet */}, 2121 {"process(transliterator)", k_type_plugin_transliterator, parseTransliterator}, 2122 {"process(dependency)", k_type_plugin_dependency, parseDependency}, 2123 {"reserved", nullptr, nullptr} 2124 }; 2125 2126 void initParser() 2127 { 2128 U_STRING_INIT(k_type_string, "string", 6); 2129 U_STRING_INIT(k_type_binary, "binary", 6); 2130 U_STRING_INIT(k_type_bin, "bin", 3); 2131 U_STRING_INIT(k_type_table, "table", 5); 2132 U_STRING_INIT(k_type_table_no_fallback, "table(nofallback)", 17); 2133 U_STRING_INIT(k_type_int, "int", 3); 2134 U_STRING_INIT(k_type_integer, "integer", 7); 2135 U_STRING_INIT(k_type_array, "array", 5); 2136 U_STRING_INIT(k_type_alias, "alias", 5); 2137 U_STRING_INIT(k_type_intvector, "intvector", 9); 2138 U_STRING_INIT(k_type_import, "import", 6); 2139 U_STRING_INIT(k_type_include, "include", 7); 2140 2141 U_STRING_INIT(k_type_plugin_uca_rules, "process(uca_rules)", 18); 2142 U_STRING_INIT(k_type_plugin_collation, "process(collation)", 18); 2143 U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)", 23); 2144 U_STRING_INIT(k_type_plugin_dependency, "process(dependency)", 19); 2145 } 2146 2147 static inline UBool isTable(enum EResourceType type) { 2148 return type == RESTYPE_TABLE || type == RESTYPE_TABLE_NO_FALLBACK; 2149 } 2150 2151 static enum EResourceType 2152 parseResourceType(ParseState* state, UErrorCode *status) 2153 { 2154 struct UString *tokenValue; 2155 struct UString comment; 2156 enum EResourceType result = RESTYPE_UNKNOWN; 2157 uint32_t line=0; 2158 ustr_init(&comment); 2159 expect(state, TOK_STRING, &tokenValue, &comment, &line, status); 2160 2161 if (U_FAILURE(*status)) 2162 { 2163 return RESTYPE_UNKNOWN; 2164 } 2165 2166 *status = U_ZERO_ERROR; 2167 2168 /* Search for normal types */ 2169 result=RESTYPE_UNKNOWN; 2170 while ((result = static_cast<EResourceType>(result + 1)) < RESTYPE_RESERVED) { 2171 if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) { 2172 break; 2173 } 2174 } 2175 /* Now search for the aliases */ 2176 if (u_strcmp(tokenValue->fChars, k_type_int) == 0) { 2177 result = RESTYPE_INTEGER; 2178 } 2179 else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) { 2180 result = RESTYPE_BINARY; 2181 } 2182 else if (result == RESTYPE_RESERVED) { 2183 char tokenBuffer[1024]; 2184 u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer)); 2185 tokenBuffer[sizeof(tokenBuffer) - 1] = 0; 2186 *status = U_INVALID_FORMAT_ERROR; 2187 error(line, "unknown resource type '%s'", tokenBuffer); 2188 } 2189 2190 return result; 2191 } 2192 2193 /* parse a non-top-level resource */ 2194 static struct SResource * 2195 parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status) 2196 { 2197 enum ETokenType token; 2198 enum EResourceType resType = RESTYPE_UNKNOWN; 2199 ParseResourceFunction *parseFunction = nullptr; 2200 struct UString *tokenValue; 2201 uint32_t startline; 2202 uint32_t line; 2203 2204 2205 token = getToken(state, &tokenValue, nullptr, &startline, status); 2206 2207 if(isVerbose()){ 2208 printf(" resource %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline)); 2209 } 2210 2211 /* name . [ ':' type ] '{' resource '}' */ 2212 /* This function parses from the colon onwards. If the colon is present, parse the 2213 type then try to parse a resource of that type. If there is no explicit type, 2214 work it out using the lookahead tokens. */ 2215 switch (token) 2216 { 2217 case TOK_EOF: 2218 *status = U_INVALID_FORMAT_ERROR; 2219 error(startline, "Unexpected EOF encountered"); 2220 return nullptr; 2221 2222 case TOK_ERROR: 2223 *status = U_INVALID_FORMAT_ERROR; 2224 return nullptr; 2225 2226 case TOK_COLON: 2227 resType = parseResourceType(state, status); 2228 expect(state, TOK_OPEN_BRACE, &tokenValue, nullptr, &startline, status); 2229 2230 if (U_FAILURE(*status)) 2231 { 2232 return nullptr; 2233 } 2234 2235 break; 2236 2237 case TOK_OPEN_BRACE: 2238 break; 2239 2240 default: 2241 *status = U_INVALID_FORMAT_ERROR; 2242 error(startline, "syntax error while reading a resource, expected '{' or ':'"); 2243 return nullptr; 2244 } 2245 2246 2247 if (resType == RESTYPE_UNKNOWN) 2248 { 2249 /* No explicit type, so try to work it out. At this point, we've read the first '{'. 2250 We could have any of the following: 2251 { { => array (nested) 2252 { :/} => array 2253 { string , => string array 2254 2255 { string { => table 2256 2257 { string :/{ => table 2258 { string } => string 2259 */ 2260 2261 token = peekToken(state, 0, nullptr, &line, nullptr,status); 2262 2263 if (U_FAILURE(*status)) 2264 { 2265 return nullptr; 2266 } 2267 2268 if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE ) 2269 { 2270 resType = RESTYPE_ARRAY; 2271 } 2272 else if (token == TOK_STRING) 2273 { 2274 token = peekToken(state, 1, nullptr, &line, nullptr, status); 2275 2276 if (U_FAILURE(*status)) 2277 { 2278 return nullptr; 2279 } 2280 2281 switch (token) 2282 { 2283 case TOK_COMMA: resType = RESTYPE_ARRAY; break; 2284 case TOK_OPEN_BRACE: resType = RESTYPE_TABLE; break; 2285 case TOK_CLOSE_BRACE: resType = RESTYPE_STRING; break; 2286 case TOK_COLON: resType = RESTYPE_TABLE; break; 2287 default: 2288 *status = U_INVALID_FORMAT_ERROR; 2289 error(line, "Unexpected token after string, expected ',', '{' or '}'"); 2290 return nullptr; 2291 } 2292 } 2293 else 2294 { 2295 *status = U_INVALID_FORMAT_ERROR; 2296 error(line, "Unexpected token after '{'"); 2297 return nullptr; 2298 } 2299 2300 /* printf("Type guessed as %s\n", resourceNames[resType]); */ 2301 } else if(resType == RESTYPE_TABLE_NO_FALLBACK) { 2302 *status = U_INVALID_FORMAT_ERROR; 2303 error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars); 2304 return nullptr; 2305 } 2306 2307 2308 /* We should now know what we need to parse next, so call the appropriate parser 2309 function and return. */ 2310 parseFunction = gResourceTypes[resType].parseFunction; 2311 if (parseFunction != nullptr) { 2312 return parseFunction(state, tag, startline, comment, status); 2313 } 2314 else { 2315 *status = U_INTERNAL_PROGRAM_ERROR; 2316 error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars); 2317 } 2318 2319 return nullptr; 2320 } 2321 2322 /* parse the top-level resource */ 2323 struct SRBRoot * 2324 parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename, 2325 UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status) 2326 { 2327 struct UString *tokenValue; 2328 struct UString comment; 2329 uint32_t line; 2330 enum EResourceType bundleType; 2331 enum ETokenType token; 2332 ParseState state; 2333 uint32_t i; 2334 2335 2336 for (i = 0; i < MAX_LOOKAHEAD + 1; i++) 2337 { 2338 ustr_init(&state.lookahead[i].value); 2339 ustr_init(&state.lookahead[i].comment); 2340 } 2341 2342 initLookahead(&state, buf, status); 2343 2344 state.inputdir = inputDir; 2345 state.inputdirLength = state.inputdir != nullptr ? static_cast<uint32_t>(uprv_strlen(state.inputdir)) : 0; 2346 state.outputdir = outputDir; 2347 state.outputdirLength = state.outputdir != nullptr ? static_cast<uint32_t>(uprv_strlen(state.outputdir)) : 0; 2348 state.filename = filename; 2349 state.makeBinaryCollation = makeBinaryCollation; 2350 state.omitCollationRules = omitCollationRules; 2351 state.icu4xMode = icu4xMode; 2352 2353 ustr_init(&comment); 2354 expect(&state, TOK_STRING, &tokenValue, &comment, nullptr, status); 2355 2356 state.bundle = new SRBRoot(&comment, false, *status); 2357 2358 if (state.bundle == nullptr || U_FAILURE(*status)) 2359 { 2360 delete state.bundle; 2361 2362 return nullptr; 2363 } 2364 2365 2366 state.bundle->setLocale(tokenValue->fChars, *status); 2367 2368 /* The following code is to make Empty bundle work no matter with :table specifer or not */ 2369 token = getToken(&state, nullptr, nullptr, &line, status); 2370 if(token==TOK_COLON) { 2371 *status=U_ZERO_ERROR; 2372 bundleType=parseResourceType(&state, status); 2373 2374 if(isTable(bundleType)) 2375 { 2376 expect(&state, TOK_OPEN_BRACE, nullptr, nullptr, &line, status); 2377 } 2378 else 2379 { 2380 *status=U_PARSE_ERROR; 2381 error(line, "parse error. Stopped parsing with %s", u_errorName(*status)); 2382 } 2383 } 2384 else 2385 { 2386 /* not a colon */ 2387 if(token==TOK_OPEN_BRACE) 2388 { 2389 *status=U_ZERO_ERROR; 2390 bundleType=RESTYPE_TABLE; 2391 } 2392 else 2393 { 2394 /* neither colon nor open brace */ 2395 *status=U_PARSE_ERROR; 2396 bundleType=RESTYPE_UNKNOWN; 2397 error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status)); 2398 } 2399 } 2400 2401 if (U_FAILURE(*status)) 2402 { 2403 delete state.bundle; 2404 return nullptr; 2405 } 2406 2407 if(bundleType==RESTYPE_TABLE_NO_FALLBACK) { 2408 /* 2409 * Parse a top-level table with the table(nofallback) declaration. 2410 * This is the same as a regular table, but also sets the 2411 * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] . 2412 */ 2413 state.bundle->fNoFallback=true; 2414 } 2415 /* top-level tables need not handle special table names like "collations" */ 2416 assert(!state.bundle->fIsPoolBundle); 2417 assert(state.bundle->fRoot->fType == URES_TABLE); 2418 TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot); 2419 realParseTable(&state, rootTable, nullptr, line, status); 2420 if(dependencyArray!=nullptr){ 2421 rootTable->add(dependencyArray, 0, *status); 2422 dependencyArray = nullptr; 2423 } 2424 if (U_FAILURE(*status)) 2425 { 2426 delete state.bundle; 2427 res_close(dependencyArray); 2428 return nullptr; 2429 } 2430 2431 if (getToken(&state, nullptr, nullptr, &line, status) != TOK_EOF) 2432 { 2433 warning(line, "extraneous text after resource bundle (perhaps unmatched braces)"); 2434 if(isStrict()){ 2435 *status = U_INVALID_FORMAT_ERROR; 2436 return nullptr; 2437 } 2438 } 2439 2440 cleanupLookahead(&state); 2441 ustr_deinit(&comment); 2442 return state.bundle; 2443 }