tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

parse.cpp (80186B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1998-2015, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *
     11 * File parse.cpp
     12 *
     13 * Modification History:
     14 *
     15 *   Date          Name          Description
     16 *   05/26/99     stephen       Creation.
     17 *   02/25/00     weiv          Overhaul to write udata
     18 *   5/10/01      Ram           removed ustdio dependency
     19 *   06/10/2001  Dominic Ludlam <dom@recoil.org> Rewritten
     20 *******************************************************************************
     21 */
     22 
     23 // Safer use of UnicodeString.
     24 #include <cstdint>
     25 #include "unicode/umachine.h"
     26 #ifndef UNISTR_FROM_CHAR_EXPLICIT
     27 #   define UNISTR_FROM_CHAR_EXPLICIT explicit
     28 #endif
     29 
     30 // Less important, but still a good idea.
     31 #ifndef UNISTR_FROM_STRING_EXPLICIT
     32 #   define UNISTR_FROM_STRING_EXPLICIT explicit
     33 #endif
     34 
     35 #include <assert.h>
     36 #include "parse.h"
     37 #include "errmsg.h"
     38 #include "uhash.h"
     39 #include "cmemory.h"
     40 #include "cstring.h"
     41 #include "uinvchar.h"
     42 #include "read.h"
     43 #include "ustr.h"
     44 #include "reslist.h"
     45 #include "rbt_pars.h"
     46 #include "genrb.h"
     47 #include "unicode/normalizer2.h"
     48 #include "unicode/stringpiece.h"
     49 #include "unicode/unistr.h"
     50 #include "unicode/ustring.h"
     51 #include "unicode/uscript.h"
     52 #include "unicode/utf16.h"
     53 #include "unicode/putil.h"
     54 #include "charstr.h"
     55 #include "collationbuilder.h"
     56 #include "collationdata.h"
     57 #include "collationdatareader.h"
     58 #include "collationdatawriter.h"
     59 #include "collationfastlatinbuilder.h"
     60 #include "collationinfo.h"
     61 #include "collationroot.h"
     62 #include "collationruleparser.h"
     63 #include "collationtailoring.h"
     64 #include <stdio.h>
     65 #include "writesrc.h"
     66 
     67 /* Number of tokens to read ahead of the current stream position */
     68 #define MAX_LOOKAHEAD   3
     69 
     70 #define CR               0x000D
     71 #define LF               0x000A
     72 #define SPACE            0x0020
     73 #define TAB              0x0009
     74 #define ESCAPE           0x005C
     75 #define HASH             0x0023
     76 #define QUOTE            0x0027
     77 #define ZERO             0x0030
     78 #define STARTCOMMAND     0x005B
     79 #define ENDCOMMAND       0x005D
     80 #define OPENSQBRACKET    0x005B
     81 #define CLOSESQBRACKET   0x005D
     82 
     83 #define ICU4X_DIACRITIC_BASE  0x0300
     84 #define ICU4X_DIACRITIC_LIMIT 0x034F
     85 
     86 using icu::CharString;
     87 using icu::LocalMemory;
     88 using icu::LocalPointer;
     89 using icu::LocalUCHARBUFPointer;
     90 using icu::StringPiece;
     91 using icu::UnicodeString;
     92 
     93 struct Lookahead
     94 {
     95     enum   ETokenType type;
     96     struct UString    value;
     97     struct UString    comment;
     98     uint32_t          line;
     99 };
    100 
    101 /* keep in sync with token defines in read.h */
    102 const char *tokenNames[TOK_TOKEN_COUNT] =
    103 {
    104     "string",             /* A string token, such as "MonthNames" */
    105     "'{'",                 /* An opening brace character */
    106     "'}'",                 /* A closing brace character */
    107     "','",                 /* A comma */
    108     "':'",                 /* A colon */
    109 
    110     "<end of file>",     /* End of the file has been reached successfully */
    111     "<end of line>"
    112 };
    113 
    114 /* Just to store "TRUE" */
    115 //static const char16_t trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000};
    116 
    117 typedef struct {
    118    struct Lookahead  lookahead[MAX_LOOKAHEAD + 1];
    119    uint32_t          lookaheadPosition;
    120    UCHARBUF         *buffer;
    121    struct SRBRoot *bundle;
    122    const char     *inputdir;
    123    uint32_t        inputdirLength;
    124    const char     *outputdir;
    125    uint32_t        outputdirLength;
    126    const char     *filename;
    127    UBool           makeBinaryCollation;
    128    UBool           omitCollationRules;
    129    UBool           icu4xMode;
    130 } ParseState;
    131 
    132 typedef struct SResource *
    133 ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status);
    134 
    135 static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status);
    136 
/* The nature of the lookahead buffer:
   There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer.  This provides
   MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value.
   When getToken is called, the current pointer is moved to the next slot and the
   old slot is filled with the next token from the reader by calling getNextToken.
   The token values are stored in the slots, which means that a token value does
   not survive the next call to getToken, i.e.

   struct UString *value;

   getToken(state, &value,  nullptr, nullptr, status);
   getToken(state, nullptr, nullptr, nullptr, status);   bad - value now refers to a different string
*/
    150 static void
    151 initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status)
    152 {
    153    static uint32_t initTypeStrings = 0;
    154    uint32_t i;
    155 
    156    if (!initTypeStrings)
    157    {
    158        initTypeStrings = 1;
    159    }
    160 
    161    state->lookaheadPosition   = 0;
    162    state->buffer              = buf;
    163 
    164    resetLineNumber();
    165 
    166    for (i = 0; i < MAX_LOOKAHEAD; i++)
    167    {
    168        state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
    169        if (U_FAILURE(*status))
    170        {
    171            return;
    172        }
    173    }
    174 
    175    *status = U_ZERO_ERROR;
    176 }
    177 
    178 static void
    179 cleanupLookahead(ParseState* state)
    180 {
    181    uint32_t i;
    182    for (i = 0; i <= MAX_LOOKAHEAD; i++)
    183    {
    184        ustr_deinit(&state->lookahead[i].value);
    185        ustr_deinit(&state->lookahead[i].comment);
    186    }
    187 
    188 }
    189 
    190 static enum ETokenType
    191 getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status)
    192 {
    193    enum ETokenType result;
    194    uint32_t          i;
    195 
    196    result = state->lookahead[state->lookaheadPosition].type;
    197 
    198    if (tokenValue != nullptr)
    199    {
    200        *tokenValue = &state->lookahead[state->lookaheadPosition].value;
    201    }
    202 
    203    if (linenumber != nullptr)
    204    {
    205        *linenumber = state->lookahead[state->lookaheadPosition].line;
    206    }
    207 
    208    if (comment != nullptr)
    209    {
    210        ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
    211    }
    212 
    213    i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1);
    214    state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1);
    215    ustr_setlen(&state->lookahead[i].comment, 0, status);
    216    ustr_setlen(&state->lookahead[i].value, 0, status);
    217    state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
    218 
    219    /* printf("getToken, returning %s\n", tokenNames[result]); */
    220 
    221    return result;
    222 }
    223 
    224 static enum ETokenType
    225 peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status)
    226 {
    227    uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1);
    228 
    229    if (U_FAILURE(*status))
    230    {
    231        return TOK_ERROR;
    232    }
    233 
    234    if (lookaheadCount >= MAX_LOOKAHEAD)
    235    {
    236        *status = U_INTERNAL_PROGRAM_ERROR;
    237        return TOK_ERROR;
    238    }
    239 
    240    if (tokenValue != nullptr)
    241    {
    242        *tokenValue = &state->lookahead[i].value;
    243    }
    244 
    245    if (linenumber != nullptr)
    246    {
    247        *linenumber = state->lookahead[i].line;
    248    }
    249 
    250    if(comment != nullptr){
    251        ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
    252    }
    253 
    254    return state->lookahead[i].type;
    255 }
    256 
    257 static void
    258 expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status)
    259 {
    260    uint32_t        line;
    261 
    262    enum ETokenType token = getToken(state, tokenValue, comment, &line, status);
    263 
    264    if (linenumber != nullptr)
    265    {
    266        *linenumber = line;
    267    }
    268 
    269    if (U_FAILURE(*status))
    270    {
    271        return;
    272    }
    273 
    274    if (token != expectedToken)
    275    {
    276        *status = U_INVALID_FORMAT_ERROR;
    277        error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]);
    278    }
    279    else
    280    {
    281        *status = U_ZERO_ERROR;
    282    }
    283 }
    284 
    285 static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment,
    286                                int32_t &stringLength, UErrorCode *status)
    287 {
    288    struct UString *tokenValue;
    289    char           *result;
    290 
    291    expect(state, TOK_STRING, &tokenValue, comment, line, status);
    292 
    293    if (U_FAILURE(*status))
    294    {
    295        return nullptr;
    296    }
    297 
    298    if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) {
    299        *status = U_INVALID_FORMAT_ERROR;
    300        error((line == nullptr) ? 0 : *line, "invariant characters required for table keys, binary data, etc.");
    301        return nullptr;
    302    }
    303 
    304    result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1));
    305 
    306    if (result == nullptr)
    307    {
    308        *status = U_MEMORY_ALLOCATION_ERROR;
    309        return nullptr;
    310    }
    311 
    312    u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1);
    313    stringLength = tokenValue->fLength;
    314    return result;
    315 }
    316 
    317 static struct SResource *
    318 parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
    319 {
    320    struct SResource *result = nullptr;
    321    struct UString   *tokenValue;
    322    FileStream       *file          = nullptr;
    323    CharString       filename;
    324    uint32_t         line;
    325    UBool quoted = false;
    326    UCHARBUF *ucbuf=nullptr;
    327    UChar32   c     = 0;
    328    const char* cp  = nullptr;
    329    char16_t *pTarget     = nullptr;
    330    char16_t *target      = nullptr;
    331    char16_t *targetLimit = nullptr;
    332    int32_t size = 0;
    333 
    334    expect(state, TOK_STRING, &tokenValue, nullptr, &line, status);
    335 
    336    if(isVerbose()){
    337        printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
    338    }
    339 
    340    if (U_FAILURE(*status))
    341    {
    342        return nullptr;
    343    }
    344    /* make the filename including the directory */
    345    if (state->inputdir != nullptr)
    346    {
    347        filename.append(state->inputdir, -1, *status);
    348 
    349        if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
    350        {
    351            filename.append(U_FILE_SEP_CHAR, *status);
    352        }
    353    }
    354 
    355    filename.appendInvariantChars(tokenValue->fChars, tokenValue->fLength, *status);
    356 
    357    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
    358 
    359    if (U_FAILURE(*status))
    360    {
    361        return nullptr;
    362    }
    363 
    364    if(state->omitCollationRules) {
    365        return res_none();
    366    }
    367 
    368    ucbuf = ucbuf_open(filename.data(), &cp, getShowWarning(),false, status);
    369 
    370    if (U_FAILURE(*status)) {
    371        error(line, "An error occurred while opening the input file %s\n", filename.data());
    372        return nullptr;
    373    }
    374 
    375    /* We allocate more space than actually required
    376    * since the actual size needed for storing UChars
    377    * is not known in UTF-8 byte stream
    378    */
    379    size        = ucbuf_size(ucbuf) + 1;
    380    pTarget = static_cast<char16_t*>(uprv_malloc(U_SIZEOF_UCHAR * size));
    381    uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
    382    target      = pTarget;
    383    targetLimit = pTarget+size;
    384 
    385    /* read the rules into the buffer */
    386    while (target < targetLimit)
    387    {
    388        c = ucbuf_getc(ucbuf, status);
    389        if(c == QUOTE) {
    390            quoted = static_cast<UBool>(!quoted);
    391        }
    392        /* weiv (06/26/2002): adding the following:
    393         * - preserving spaces in commands [...]
    394         * - # comments until the end of line
    395         */
    396        if (c == STARTCOMMAND && !quoted)
    397        {
    398            /* preserve commands
    399             * closing bracket will be handled by the
    400             * append at the end of the loop
    401             */
    402            while(c != ENDCOMMAND) {
    403                U_APPEND_CHAR32_ONLY(c, target);
    404                c = ucbuf_getc(ucbuf, status);
    405            }
    406        }
    407        else if (c == HASH && !quoted) {
    408            /* skip comments */
    409            while(c != CR && c != LF) {
    410                c = ucbuf_getc(ucbuf, status);
    411            }
    412            continue;
    413        }
    414        else if (c == ESCAPE)
    415        {
    416            c = unescape(ucbuf, status);
    417 
    418            if (c == static_cast<UChar32>(U_ERR))
    419            {
    420                uprv_free(pTarget);
    421                T_FileStream_close(file);
    422                return nullptr;
    423            }
    424        }
    425        else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF))
    426        {
    427            /* ignore spaces carriage returns
    428            * and line feed unless in the form \uXXXX
    429            */
    430            continue;
    431        }
    432 
    433        /* Append char16_t * after dissembling if c > 0xffff*/
    434        if (c != static_cast<UChar32>(U_EOF))
    435        {
    436            U_APPEND_CHAR32_ONLY(c, target);
    437        }
    438        else
    439        {
    440            break;
    441        }
    442    }
    443 
    444    /* terminate the string */
    445    if(target < targetLimit){
    446        *target = 0x0000;
    447    }
    448 
    449    result = string_open(state->bundle, tag, pTarget, static_cast<int32_t>(target - pTarget), nullptr, status);
    450 
    451 
    452    ucbuf_close(ucbuf);
    453    uprv_free(pTarget);
    454    T_FileStream_close(file);
    455 
    456    return result;
    457 }
    458 
    459 static struct SResource *
    460 parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
    461 {
    462    struct SResource *result = nullptr;
    463    struct UString   *tokenValue;
    464    FileStream       *file          = nullptr;
    465    char              filename[256] = { '\0' };
    466    char              cs[128]       = { '\0' };
    467    uint32_t          line;
    468    UCHARBUF *ucbuf=nullptr;
    469    const char* cp  = nullptr;
    470    char16_t *pTarget     = nullptr;
    471    const char16_t *pSource     = nullptr;
    472    int32_t size = 0;
    473 
    474    expect(state, TOK_STRING, &tokenValue, nullptr, &line, status);
    475 
    476    if(isVerbose()){
    477        printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
    478    }
    479 
    480    if (U_FAILURE(*status))
    481    {
    482        return nullptr;
    483    }
    484    /* make the filename including the directory */
    485    if (state->inputdir != nullptr)
    486    {
    487        uprv_strcat(filename, state->inputdir);
    488 
    489        if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
    490        {
    491            uprv_strcat(filename, U_FILE_SEP_STRING);
    492        }
    493    }
    494 
    495    u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
    496 
    497    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
    498 
    499    if (U_FAILURE(*status))
    500    {
    501        return nullptr;
    502    }
    503    uprv_strcat(filename, cs);
    504 
    505 
    506    ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
    507 
    508    if (U_FAILURE(*status)) {
    509        error(line, "An error occurred while opening the input file %s\n", filename);
    510        return nullptr;
    511    }
    512 
    513    /* We allocate more space than actually required
    514    * since the actual size needed for storing UChars
    515    * is not known in UTF-8 byte stream
    516    */
    517    pSource = ucbuf_getBuffer(ucbuf, &size, status);
    518    pTarget = static_cast<char16_t*>(uprv_malloc(U_SIZEOF_UCHAR * (size + 1)));
    519    uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
    520 
    521 #if !UCONFIG_NO_TRANSLITERATION
    522    size = utrans_stripRules(pSource, size, pTarget, status);
    523 #else
    524    size = 0;
    525    fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n");
    526 #endif
    527    result = string_open(state->bundle, tag, pTarget, size, nullptr, status);
    528 
    529    ucbuf_close(ucbuf);
    530    uprv_free(pTarget);
    531    T_FileStream_close(file);
    532 
    533    return result;
    534 }
    535 static ArrayResource* dependencyArray = nullptr;
    536 
    537 static struct SResource *
    538 parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
    539 {
    540    struct SResource *result = nullptr;
    541    struct SResource *elem = nullptr;
    542    struct UString   *tokenValue;
    543    uint32_t          line;
    544    char              filename[256] = { '\0' };
    545    char              cs[128]       = { '\0' };
    546 
    547    expect(state, TOK_STRING, &tokenValue, nullptr, &line, status);
    548 
    549    if(isVerbose()){
    550        printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
    551    }
    552 
    553    if (U_FAILURE(*status))
    554    {
    555        return nullptr;
    556    }
    557    /* make the filename including the directory */
    558    if (state->outputdir != nullptr)
    559    {
    560        uprv_strcat(filename, state->outputdir);
    561 
    562        if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR)
    563        {
    564            uprv_strcat(filename, U_FILE_SEP_STRING);
    565        }
    566    }
    567 
    568    u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
    569 
    570    if (U_FAILURE(*status))
    571    {
    572        return nullptr;
    573    }
    574    uprv_strcat(filename, cs);
    575    if(!T_FileStream_file_exists(filename)){
    576        if(isStrict()){
    577            error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
    578        }else{
    579            warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
    580        }
    581    }
    582    if(dependencyArray==nullptr){
    583        dependencyArray = array_open(state->bundle, "%%DEPENDENCY", nullptr, status);
    584    }
    585    if(tag!=nullptr){
    586        result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
    587    }
    588    elem = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, comment, status);
    589 
    590    dependencyArray->add(elem);
    591 
    592    if (U_FAILURE(*status))
    593    {
    594        return nullptr;
    595    }
    596    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
    597    return result;
    598 }
    599 static struct SResource *
    600 parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
    601 {
    602    struct UString   *tokenValue;
    603    struct SResource *result = nullptr;
    604 
    605 /*    if (tag != nullptr && uprv_strcmp(tag, "%%UCARULES") == 0)
    606    {
    607        return parseUCARules(tag, startline, status);
    608    }*/
    609    if(isVerbose()){
    610        printf(" string %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
    611    }
    612    expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status);
    613 
    614    if (U_SUCCESS(*status))
    615    {
    616        /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
    617        doesn't survive expect either) */
    618 
    619        result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
    620        if(U_SUCCESS(*status) && result) {
    621            expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
    622 
    623            if (U_FAILURE(*status))
    624            {
    625                res_close(result);
    626                return nullptr;
    627            }
    628        }
    629    }
    630 
    631    return result;
    632 }
    633 
    634 static struct SResource *
    635 parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
    636 {
    637    struct UString   *tokenValue;
    638    struct SResource *result  = nullptr;
    639 
    640    expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status);
    641 
    642    if(isVerbose()){
    643        printf(" alias %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
    644    }
    645 
    646    if (U_SUCCESS(*status))
    647    {
    648        /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
    649        doesn't survive expect either) */
    650 
    651        result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
    652 
    653        expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
    654 
    655        if (U_FAILURE(*status))
    656        {
    657            res_close(result);
    658            return nullptr;
    659        }
    660    }
    661 
    662    return result;
    663 }
    664 
    665 #if !UCONFIG_NO_COLLATION
    666 
    667 namespace {
    668 
    669 struct SResource* resLookup(struct SResource* res, const char* key) {
    670    if (res == res_none() || !res->isTable()) {
    671        return nullptr;
    672    }
    673 
    674    TableResource *list = static_cast<TableResource *>(res);
    675    SResource *current = list->fFirst;
    676    while (current != nullptr) {
    677        if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) {
    678            return current;
    679        }
    680        current = current->fNext;
    681    }
    682    return nullptr;
    683 }
    684 
    685 class GenrbImporter : public icu::CollationRuleParser::Importer {
    686 public:
    687    GenrbImporter(const char *in, const char *out) : inputDir(in), outputDir(out) {}
    688    virtual ~GenrbImporter();
    689    virtual void getRules(
    690            const char *localeID, const char *collationType,
    691            UnicodeString &rules,
    692            const char *&errorReason, UErrorCode &errorCode) override;
    693 
    694 private:
    695    const char *inputDir;
    696    const char *outputDir;
    697 };
    698 
    699 GenrbImporter::~GenrbImporter() {}
    700 
    701 void
    702 GenrbImporter::getRules(
    703        const char *localeID, const char *collationType,
    704        UnicodeString &rules,
    705        const char *& /*errorReason*/, UErrorCode &errorCode) {
    706    CharString filename(localeID, errorCode);
    707    for(int32_t i = 0; i < filename.length(); i++){
    708        if(filename[i] == '-'){
    709            filename.data()[i] = '_';
    710        }
    711    }
    712    filename.append(".txt", errorCode);
    713    if (U_FAILURE(errorCode)) {
    714        return;
    715    }
    716    CharString inputDirBuf;
    717    CharString openFileName;
    718    if(inputDir == nullptr) {
    719        const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR);
    720        if (filenameBegin != nullptr) {
    721            /*
    722             * When a filename ../../../data/root.txt is specified,
    723             * we presume that the input directory is ../../../data
    724             * This is very important when the resource file includes
    725             * another file, like UCARules.txt or thaidict.brk.
    726             */
    727            StringPiece dir = filename.toStringPiece();
    728            const char *filenameLimit = filename.data() + filename.length();
    729            dir.remove_suffix(static_cast<int32_t>(filenameLimit - filenameBegin));
    730            inputDirBuf.append(dir, errorCode);
    731            inputDir = inputDirBuf.data();
    732        }
    733    }else{
    734        int32_t dirlen = static_cast<int32_t>(uprv_strlen(inputDir));
    735 
    736        if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) {
    737            /*
    738             * append the input dir to openFileName if the first char in
    739             * filename is not file separator char and the last char input directory is  not '.'.
    740             * This is to support :
    741             * genrb -s. /home/icu/data
    742             * genrb -s. icu/data
    743             * The user cannot mix notations like
    744             * genrb -s. /icu/data --- the absolute path specified. -s redundant
    745             * user should use
    746             * genrb -s. icu/data  --- start from CWD and look in icu/data dir
    747             */
    748            openFileName.append(inputDir, dirlen, errorCode);
    749            if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
    750                openFileName.append(U_FILE_SEP_CHAR, errorCode);
    751            }
    752        }
    753    }
    754    openFileName.append(filename, errorCode);
    755    if(U_FAILURE(errorCode)) {
    756        return;
    757    }
    758    // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data());
    759    const char* cp = "";
    760    LocalUCHARBUFPointer ucbuf(
    761            ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode));
    762    if(errorCode == U_FILE_ACCESS_ERROR) {
    763        fprintf(stderr, "couldn't open file %s\n", openFileName.data());
    764        return;
    765    }
    766    if (ucbuf.isNull() || U_FAILURE(errorCode)) {
    767        fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode));
    768        return;
    769    }
    770 
    771    /* Parse the data into an SRBRoot */
    772    LocalPointer<SRBRoot> data(
    773            parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode));
    774    if (U_FAILURE(errorCode)) {
    775        return;
    776    }
    777 
    778    struct SResource *root = data->fRoot;
    779    struct SResource *collations = resLookup(root, "collations");
    780    if (collations != nullptr) {
    781      struct SResource *collation = resLookup(collations, collationType);
    782      if (collation != nullptr) {
    783        struct SResource *sequence = resLookup(collation, "Sequence");
    784        if (sequence != nullptr && sequence->isString()) {
    785          // No string pointer aliasing so that we need not hold onto the resource bundle.
    786          StringResource *sr = static_cast<StringResource *>(sequence);
    787          rules = sr->fString;
    788        }
    789      }
    790    }
    791 }
    792 
    793 // Quick-and-dirty escaping function.
    794 // Assumes that we are on an ASCII-based platform.
    795 void
    796 escape(const char16_t *s, char *buffer, size_t n) {
    797    int32_t length = u_strlen(s);
    798    int32_t i = 0;
    799    for (;;) {
    800        UChar32 c;
    801        U16_NEXT(s, i, length, c);
    802        if (c == 0) {
    803            *buffer = 0;
    804            return;
    805        } else if (0x20 <= c && c <= 0x7e) {
    806            // printable ASCII
    807            *buffer++ = static_cast<char>(c); // assumes ASCII-based platform
    808        } else {
    809            buffer += snprintf(buffer, n, "\\u%04X", static_cast<int>(c));
    810        }
    811    }
    812 }
    813 
    814 }  // namespace
    815 
    816 static FILE*
    817 openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
    818    CharString baseName;
    819    baseName.append(name, *status);
    820    baseName.append("_", *status);
    821    baseName.append(collationType, *status);
    822    baseName.append("_", *status);
    823    baseName.append(structType, *status);
    824 
    825    CharString outFileName;
    826    if (outputdir && *outputdir) {
    827        outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
    828    }
    829    outFileName.append(baseName, *status);
    830    outFileName.append(".toml", *status);
    831    if (U_FAILURE(*status)) {
    832        return nullptr;
    833    }
    834 
    835    FILE* f = fopen(outFileName.data(), "w");
    836    if (!f) {
    837        *status = U_FILE_ACCESS_ERROR;
    838        return nullptr;
    839    }
    840    usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X");
    841 
    842    return f;
    843 }
    844 
    845 static void
    846 writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
    847    FILE* f = openTOML(outputdir, name, collationType, "meta", status);
    848    if (!f) {
    849        return;
    850    }
    851    // printf("writeCollationMetadataTOML %s %s\n", name, collationType);
    852    fprintf(f, "bits = 0x%X\n", metadataBits);
    853    fclose(f);
    854 }
    855 
    856 static UChar32
    857 writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
    858    UChar32 limit = ICU4X_DIACRITIC_LIMIT;
    859    FILE* f = openTOML(outputdir, name, collationType, "dia", status);
    860    if (!f) {
    861        return limit;
    862    }
    863    // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType);
    864    uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
    865    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
    866        uint16_t secondary = 0;
    867        uint32_t ce32 = data->getCE32(c);
    868        if (ce32 == icu::Collation::FALLBACK_CE32) {
    869            ce32 = data->base->getCE32(c);
    870        }
    871        if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
    872            // These never occur in NFD data
    873        } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
    874            if (uprv_strcmp(name, "root") == 0) {
    875                printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
    876                fclose(f);
    877                *status = U_INTERNAL_PROGRAM_ERROR;
    878                return limit;
    879            }
    880            limit = c;
    881            break;
    882        } else {
    883            uint64_t ce = static_cast<uint64_t>(icu::Collation::ceFromCE32(ce32));
    884            if ((ce & 0xFFFFFFFF0000FFFF) != static_cast<uint64_t>(icu::Collation::COMMON_TERTIARY_CE)) {
    885                // Not a CE where only the secondary weight differs from the expected
    886                // pattern.
    887                limit = c;
    888                break;
    889            }
    890            secondary = static_cast<uint16_t>(ce >> 16);
    891        }
    892        secondaries[c - ICU4X_DIACRITIC_BASE] = secondary;
    893 
    894    }
    895    usrc_writeArray(f, "secondaries = [\n  ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, "  ", "\n]\n");
    896    fclose(f);
    897    return limit;
    898 }
    899 
    900 static void
    901 writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
    902    FILE* f = openTOML(outputdir, name, collationType, "reord", status);
    903    if (!f) {
    904        return;
    905    }
    906    // printf("writeCollationReorderingTOML %s %s\n", name, collationType);
    907    fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
    908    usrc_writeArray(f, "reorder_table = [\n  ", settings->reorderTable, 8, 256, "  ", "\n]\n");
    909    usrc_writeArray(f, "reorder_ranges = [\n  ", settings->reorderRanges, 32, settings->reorderRangesLength, "  ", "\n]\n");
    910    fclose(f);
    911 }
    912 
    913 
    914 static void
    915 writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
    916    FILE* f = openTOML(outputdir, name, collationType, "jamo", status);
    917    if (!f) {
    918        printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
    919        return;
    920    }
    921    uint32_t jamo[0x1200-0x1100];
    922    for (UChar32 c = 0x1100; c < 0x1200; ++c) {
    923        uint32_t ce32 = data->getCE32(c);
    924        if (ce32 == icu::Collation::FALLBACK_CE32) {
    925            ce32 = data->base->getCE32(c);
    926        }
    927        // Can't reject complex CE32s, because search collations have expansions.
    928        // These expansions refer to the tailoring, which foils the reuse of the
    929        // these jamo tables.
    930        // XXX Figure out what to do. Perhaps instead of having Latin mini expansions,
    931        // there should be Hangul mini expansions.
    932        // XXX in any case, validate that modern jamo are self-contained.
    933        jamo[c - 0x1100] = ce32;
    934 
    935    }
    936    usrc_writeArray(f, "ce32s = [\n  ", jamo, 32, 0x1200-0x1100, "  ", "\n]\n");
    937    fclose(f);
    938 }
    939 
    940 static UBool
    941 convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    942    if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) {
    943        // Range entirely in conjoining jamo block.
    944        return true;
    945    }
    946    icu::IcuToolErrorCode status("genrb: convertTrie");
    947    umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status);
    948    return !U_FAILURE(*status);
    949 }
    950 
    951 static void
    952 writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
    953    FILE* f = openTOML(outputdir, name, collationType, "data", status);
    954    if (!f) {
    955        return;
    956    }
    957    // printf("writeCollationDataTOML %s %s\n", name, collationType);
    958 
    959    icu::UnicodeSet tailoringSet;
    960 
    961    if (data->base) {
    962        tailoringSet.addAll(*(data->unsafeBackwardSet));
    963        tailoringSet.removeAll(*(data->base->unsafeBackwardSet));
    964    } else {
    965        tailoringSet.addAll(*(data->unsafeBackwardSet));
    966    }
    967 
    968    // Use the same value for out-of-range and default in the hope of not having to allocate
    969    // different blocks, since ICU4X never does out-of-range queries.
    970    uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
    971    icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
    972 
    973    utrie2_enum(data->trie, nullptr, &convertTrie, builder.getAlias());
    974 
    975    // If the diacritic table was cut short, copy CE32s between the lowered
    976    // limit and the max limit from the root to the tailoring. As of June 2022,
    977    // no collation in CLDR needs this.
    978    for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
    979        if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
    980            // These never occur in NFD data.
    981            continue;
    982        }
    983        uint32_t ce32 = data->getCE32(c);
    984        if (ce32 == icu::Collation::FALLBACK_CE32) {
    985            ce32 = data->base->getCE32(c);
    986            umutablecptrie_set(builder.getAlias(), c, ce32, status);
    987        }
    988    }
    989 
    990    // Ensure that the range covered by the diacritic table isn't duplicated
    991    // in the trie.
    992    for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
    993        if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
    994            umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
    995        }
    996    }
    997 
    998    icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
    999    builder.getAlias(),
   1000    UCPTRIE_TYPE_SMALL,
   1001    UCPTRIE_VALUE_BITS_32,
   1002    status));
   1003    usrc_writeArray(f, "contexts = [\n  ", data->contexts, 16, data->contextsLength, "  ", "\n]\n");
   1004    usrc_writeArray(f, "ce32s = [\n  ", data->ce32s, 32, data->ce32sLength, "  ", "\n]\n");
   1005    usrc_writeArray(f, "ces = [\n  ", data->ces, 64, data->cesLength, "  ", "\n]\n");
   1006    fprintf(f, "[trie]\n");
   1007    usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
   1008 
   1009    fclose(f);
   1010 }
   1011 
   1012 static void
   1013 writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
   1014    FILE* f = openTOML(outputdir, name, collationType, "prim", status);
   1015    if (!f) {
   1016        return;
   1017    }
   1018    // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType);
   1019 
   1020    uint16_t lastPrimaries[4];
   1021    for (int32_t i = 0; i < 4; ++i) {
   1022        // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
   1023        // back to get a value that fits in 16 bits.
   1024        lastPrimaries[i] = static_cast<uint16_t>((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
   1025    }
   1026 
   1027    uint32_t numericPrimary = data->numericPrimary;
   1028    if (numericPrimary & 0xFFFFFF) {
   1029        printf("Lower 24 bits set in numeric primary");
   1030        *status = U_INTERNAL_PROGRAM_ERROR;
   1031        return;
   1032    }
   1033 
   1034    usrc_writeArray(f, "last_primaries = [\n  ", lastPrimaries, 16, 4, "  ", "\n]\n");
   1035    usrc_writeArray(f, "compressible_bytes = [\n  ", data->compressibleBytes, 1, 256, "  ", "\n]\n");
   1036    fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
   1037    fclose(f);
   1038 }
   1039 
   1040 static void
   1041 writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
   1042    UBool tailored = false;
   1043    UBool tailoredDiacritics = false;
   1044    UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
   1045    UBool reordering = false;
   1046    UBool isRoot = uprv_strcmp(name, "root") == 0;
   1047    UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;
   1048    if (!data->base && isRoot) {
   1049        diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
   1050        if (U_FAILURE(*status)) {
   1051            return;
   1052        }
   1053        writeCollationJamoTOML(outputdir, name, collationType, data, status);
   1054        if (U_FAILURE(*status)) {
   1055            return;
   1056        }
   1057        writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
   1058        if (U_FAILURE(*status)) {
   1059            return;
   1060        }
   1061    } else if (data->base && !lithuanianDotAbove) {
   1062        for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
   1063            if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
   1064                // These never occur in NFD data.
   1065                continue;
   1066            }
   1067            uint32_t ce32 = data->getCE32(c);
   1068            if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
   1069                tailoredDiacritics = true;
   1070                diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
   1071                if (U_FAILURE(*status)) {
   1072                    return;
   1073                }
   1074                break;
   1075            }
   1076        }
   1077    }
   1078 
   1079    if (settings->hasReordering()) {
   1080        reordering = true;
   1081        // Note: There are duplicate reorderings. Expecting the ICU4X provider
   1082        // to take care of deduplication.
   1083        writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
   1084        if (U_FAILURE(*status)) {
   1085            return;
   1086        }
   1087    }
   1088 
   1089    // Write collation data if either base is non-null or the name is root.
   1090    // Languages that only reorder scripts are otherwise root-like and have
   1091    // null base.
   1092    if (data->base || isRoot) {
   1093        tailored = !isRoot;
   1094        writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status);
   1095        if (U_FAILURE(*status)) {
   1096            return;
   1097        }
   1098    }
   1099 
   1100    uint32_t maxVariable = static_cast<uint32_t>(settings->getMaxVariable());
   1101    if (maxVariable >= 4) {
   1102        printf("Max variable out of range");
   1103        *status = U_INTERNAL_PROGRAM_ERROR;
   1104        return;
   1105    }
   1106 
   1107    uint32_t metadataBits = maxVariable;
   1108    if (tailored) {
   1109        metadataBits |= (1 << 3);
   1110    }
   1111    if (tailoredDiacritics) {
   1112        metadataBits |= (1 << 4);
   1113    }
   1114    if (reordering) {
   1115        metadataBits |= (1 << 5);
   1116    }
   1117    if (lithuanianDotAbove) {
   1118        metadataBits |= (1 << 6);
   1119    }
   1120    if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) {
   1121        metadataBits |= (1 << 7);
   1122    }
   1123    if (settings->getAlternateHandling() == UCOL_SHIFTED) {
   1124        metadataBits |= (1 << 8);
   1125    }
   1126    switch (settings->getCaseFirst()) {
   1127        case UCOL_OFF:
   1128            break;
   1129        case UCOL_UPPER_FIRST:
   1130            metadataBits |= (1 << 9);
   1131            metadataBits |= (1 << 10);
   1132            break;
   1133        case UCOL_LOWER_FIRST:
   1134            metadataBits |= (1 << 9);
   1135            break;
   1136        default:
   1137            *status = U_INTERNAL_PROGRAM_ERROR;
   1138            return;
   1139    }
   1140 
   1141    writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
   1142 }
   1143 
   1144 #endif  // !UCONFIG_NO_COLLATION
   1145 
   1146 static TableResource *
   1147 addCollation(ParseState* state, TableResource  *result, const char *collationType,
   1148             uint32_t startline, UErrorCode *status)
   1149 {
   1150    // TODO: Use LocalPointer for result, or make caller close it when there is a failure.
   1151    struct SResource  *member = nullptr;
   1152    struct UString    *tokenValue;
   1153    struct UString     comment;
   1154    enum   ETokenType  token;
   1155    CharString         subtag;
   1156    UnicodeString      rules;
   1157    UBool              haveRules = false;
   1158    UVersionInfo       version;
   1159    uint32_t           line;
   1160 
   1161    /* '{' . (name resource)* '}' */
   1162    version[0]=0; version[1]=0; version[2]=0; version[3]=0;
   1163 
   1164    for (;;)
   1165    {
   1166        ustr_init(&comment);
   1167        token = getToken(state, &tokenValue, &comment, &line, status);
   1168 
   1169        if (token == TOK_CLOSE_BRACE)
   1170        {
   1171            break;
   1172        }
   1173 
   1174        if (token != TOK_STRING)
   1175        {
   1176            res_close(result);
   1177            *status = U_INVALID_FORMAT_ERROR;
   1178 
   1179            if (token == TOK_EOF)
   1180            {
   1181                error(startline, "unterminated table");
   1182            }
   1183            else
   1184            {
   1185                error(line, "Unexpected token %s", tokenNames[token]);
   1186            }
   1187 
   1188            return nullptr;
   1189        }
   1190 
   1191        subtag.clear();
   1192        subtag.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status);
   1193        if (U_FAILURE(*status))
   1194        {
   1195            res_close(result);
   1196            return nullptr;
   1197        }
   1198 
   1199        member = parseResource(state, subtag.data(), nullptr, status);
   1200 
   1201        if (U_FAILURE(*status))
   1202        {
   1203            res_close(result);
   1204            return nullptr;
   1205        }
   1206        if (result == nullptr)
   1207        {
   1208            // Ignore the parsed resources, continue parsing.
   1209        }
   1210        else if (uprv_strcmp(subtag.data(), "Version") == 0 && member->isString())
   1211        {
   1212            StringResource *sr = static_cast<StringResource *>(member);
   1213            char     ver[40];
   1214            int32_t length = sr->length();
   1215 
   1216            if (length >= UPRV_LENGTHOF(ver))
   1217            {
   1218                length = UPRV_LENGTHOF(ver) - 1;
   1219            }
   1220 
   1221            sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV);
   1222            u_versionFromString(version, ver);
   1223 
   1224            result->add(member, line, *status);
   1225            member = nullptr;
   1226        }
   1227        else if(uprv_strcmp(subtag.data(), "%%CollationBin")==0)
   1228        {
   1229            /* discard duplicate %%CollationBin if any*/
   1230        }
   1231        else if (uprv_strcmp(subtag.data(), "Sequence") == 0 && member->isString())
   1232        {
   1233            StringResource *sr = static_cast<StringResource *>(member);
   1234            rules = sr->fString;
   1235            haveRules = true;
   1236            // Defer building the collator until we have seen
   1237            // all sub-elements of the collation table, including the Version.
   1238            /* in order to achieve smaller data files, we can direct genrb */
   1239            /* to omit collation rules */
   1240            if(!state->omitCollationRules) {
   1241                result->add(member, line, *status);
   1242                member = nullptr;
   1243            }
   1244        }
   1245        else  // Just copy non-special items.
   1246        {
   1247            result->add(member, line, *status);
   1248            member = nullptr;
   1249        }
   1250        res_close(member);  // TODO: use LocalPointer
   1251        if (U_FAILURE(*status))
   1252        {
   1253            res_close(result);
   1254            return nullptr;
   1255        }
   1256    }
   1257 
   1258    if (!haveRules) { return result; }
   1259 
   1260 #if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO
   1261    warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h");
   1262    (void)collationType;
   1263 #else
   1264    // CLDR ticket #3949, ICU ticket #8082:
   1265    // Do not build collation binary data for for-import-only "private" collation rule strings.
   1266    if (uprv_strncmp(collationType, "private-", 8) == 0) {
   1267        if(isVerbose()) {
   1268            printf("Not building %s~%s collation binary\n", state->filename, collationType);
   1269        }
   1270        return result;
   1271    }
   1272 
   1273    if(!state->makeBinaryCollation) {
   1274        if(isVerbose()) {
   1275            printf("Not building %s~%s collation binary\n", state->filename, collationType);
   1276        }
   1277        return result;
   1278    }
   1279    UErrorCode intStatus = U_ZERO_ERROR;
   1280    UParseError parseError;
   1281    uprv_memset(&parseError, 0, sizeof(parseError));
   1282    GenrbImporter importer(state->inputdir, state->outputdir);
   1283    const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus);
   1284    if(U_FAILURE(intStatus)) {
   1285        error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus));
   1286        res_close(result);
   1287        return nullptr;  // TODO: use LocalUResourceBundlePointer for result
   1288    }
   1289    icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
   1290    if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
   1291        builder.disableFastLatin();  // build fast-Latin table unless search collator or ICU4X
   1292    }
   1293    LocalPointer<icu::CollationTailoring> t(
   1294            builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
   1295    if(U_FAILURE(intStatus)) {
   1296        const char *reason = builder.getErrorReason();
   1297        if(reason == nullptr) { reason = ""; }
   1298        error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s  %s",
   1299                state->filename, collationType,
   1300                static_cast<long>(parseError.offset), u_errorName(intStatus), reason);
   1301        if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
   1302            // Print pre- and post-context.
   1303            char preBuffer[100], postBuffer[100];
   1304            escape(parseError.preContext, preBuffer, sizeof(preBuffer));
   1305            escape(parseError.postContext, postBuffer, sizeof(postBuffer));
   1306            error(line, "  error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer);
   1307        }
   1308        if(isStrict() || t.isNull()) {
   1309            *status = intStatus;
   1310            res_close(result);
   1311            return nullptr;
   1312        }
   1313    }
   1314    if (state->icu4xMode) {
   1315        char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
   1316        if (nameWithoutSuffix == nullptr) {
   1317            *status = U_MEMORY_ALLOCATION_ERROR;
   1318            res_close(result);
   1319            return nullptr;
   1320        }
   1321        uprv_strcpy(nameWithoutSuffix, state->filename);
   1322        *uprv_strrchr(nameWithoutSuffix, '.') = 0;
   1323 
   1324        writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
   1325        uprv_free(nameWithoutSuffix);
   1326    }
   1327    icu::LocalMemory<uint8_t> buffer;
   1328    int32_t capacity = 100000;
   1329    uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
   1330    if(dest == nullptr) {
   1331        fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
   1332                static_cast<long>(capacity));
   1333        *status = U_MEMORY_ALLOCATION_ERROR;
   1334        res_close(result);
   1335        return nullptr;
   1336    }
   1337    int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1];
   1338    int32_t totalSize = icu::CollationDataWriter::writeTailoring(
   1339            *t, *t->settings, indexes, dest, capacity, intStatus);
   1340    if(intStatus == U_BUFFER_OVERFLOW_ERROR) {
   1341        intStatus = U_ZERO_ERROR;
   1342        capacity = totalSize;
   1343        dest = buffer.allocateInsteadAndCopy(capacity);
   1344        if(dest == nullptr) {
   1345            fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
   1346                    static_cast<long>(capacity));
   1347            *status = U_MEMORY_ALLOCATION_ERROR;
   1348            res_close(result);
   1349            return nullptr;
   1350        }
   1351        totalSize = icu::CollationDataWriter::writeTailoring(
   1352                *t, *t->settings, indexes, dest, capacity, intStatus);
   1353    }
   1354    if(U_FAILURE(intStatus)) {
   1355        fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n",
   1356                u_errorName(intStatus));
   1357        res_close(result);
   1358        return nullptr;
   1359    }
   1360    if(isVerbose()) {
   1361        printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
   1362        icu::CollationInfo::printSizes(totalSize, indexes);
   1363        if(t->settings->hasReordering()) {
   1364            printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
   1365            icu::CollationInfo::printReorderRanges(
   1366                    *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
   1367        }
   1368 #if 0  // debugging output
   1369    } else {
   1370        printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
   1371        icu::CollationInfo::printSizes(totalSize, indexes);
   1372 #endif
   1373    }
   1374    struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, nullptr, nullptr, status);
   1375    result->add(collationBin, line, *status);
   1376    if (U_FAILURE(*status)) {
   1377        res_close(result);
   1378        return nullptr;
   1379    }
   1380 #endif
   1381    return result;
   1382 }
   1383 
   1384 static UBool
   1385 keepCollationType(const char * /*type*/) {
   1386    return true;
   1387 }
   1388 
   1389 static struct SResource *
   1390 parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status)
   1391 {
   1392    TableResource  *result = nullptr;
   1393    struct SResource  *member = nullptr;
   1394    struct UString    *tokenValue;
   1395    struct UString     comment;
   1396    enum   ETokenType  token;
   1397    CharString         subtag, typeKeyword;
   1398    uint32_t           line;
   1399 
   1400    result = table_open(state->bundle, tag, nullptr, status);
   1401 
   1402    if (result == nullptr || U_FAILURE(*status))
   1403    {
   1404        return nullptr;
   1405    }
   1406    if(isVerbose()){
   1407        printf(" collation elements %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1408    }
   1409    if(!newCollation) {
   1410        return addCollation(state, result, "(no type)", startline, status);
   1411    }
   1412    else {
   1413        for(;;) {
   1414            ustr_init(&comment);
   1415            token = getToken(state, &tokenValue, &comment, &line, status);
   1416 
   1417            if (token == TOK_CLOSE_BRACE)
   1418            {
   1419                return result;
   1420            }
   1421 
   1422            if (token != TOK_STRING)
   1423            {
   1424                res_close(result);
   1425                *status = U_INVALID_FORMAT_ERROR;
   1426 
   1427                if (token == TOK_EOF)
   1428                {
   1429                    error(startline, "unterminated table");
   1430                }
   1431                else
   1432                {
   1433                    error(line, "Unexpected token %s", tokenNames[token]);
   1434                }
   1435 
   1436                return nullptr;
   1437            }
   1438 
   1439            subtag.clear();
   1440            subtag.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status);
   1441 
   1442            if (U_FAILURE(*status))
   1443            {
   1444                res_close(result);
   1445                return nullptr;
   1446            }
   1447 
   1448            if (uprv_strcmp(subtag.data(), "default") == 0)
   1449            {
   1450                member = parseResource(state, subtag.data(), nullptr, status);
   1451 
   1452                if (U_FAILURE(*status))
   1453                {
   1454                    res_close(result);
   1455                    return nullptr;
   1456                }
   1457 
   1458                result->add(member, line, *status);
   1459            }
   1460            else
   1461            {
   1462                token = peekToken(state, 0, &tokenValue, &line, &comment, status);
   1463                /* this probably needs to be refactored or recursively use the parser */
   1464                /* first we assume that our collation table won't have the explicit type */
   1465                /* then, we cannot handle aliases */
   1466                if(token == TOK_OPEN_BRACE) {
   1467                    token = getToken(state, &tokenValue, &comment, &line, status);
   1468                    TableResource *collationRes;
   1469                    if (keepCollationType(subtag.data())) {
   1470                        collationRes = table_open(state->bundle, subtag.data(), nullptr, status);
   1471                    } else {
   1472                        collationRes = nullptr;
   1473                    }
   1474                    // need to parse the collation data regardless
   1475                    collationRes = addCollation(state, collationRes, subtag.data(), startline, status);
   1476                    if (collationRes != nullptr) {
   1477                        result->add(collationRes, startline, *status);
   1478                    }
   1479                } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */
   1480                    /* we could have a table too */
   1481                    token = peekToken(state, 1, &tokenValue, &line, &comment, status);
   1482                    typeKeyword.clear();
   1483                    typeKeyword.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status);
   1484                    if (U_FAILURE(*status))
   1485                    {
   1486                        res_close(result);
   1487                        return nullptr;
   1488                    }
   1489 
   1490                    if(uprv_strcmp(typeKeyword.data(), "alias") == 0) {
   1491                        member = parseResource(state, subtag.data(), nullptr, status);
   1492                        if (U_FAILURE(*status))
   1493                        {
   1494                            res_close(result);
   1495                            return nullptr;
   1496                        }
   1497 
   1498                        result->add(member, line, *status);
   1499                    } else {
   1500                        res_close(result);
   1501                        *status = U_INVALID_FORMAT_ERROR;
   1502                        return nullptr;
   1503                    }
   1504                } else {
   1505                    res_close(result);
   1506                    *status = U_INVALID_FORMAT_ERROR;
   1507                    return nullptr;
   1508                }
   1509            }
   1510 
   1511            /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/
   1512 
   1513            /*expect(TOK_CLOSE_BRACE, nullptr, nullptr, status);*/
   1514 
   1515            if (U_FAILURE(*status))
   1516            {
   1517                res_close(result);
   1518                return nullptr;
   1519            }
   1520        }
   1521    }
   1522 }
   1523 
   1524 /* Necessary, because CollationElements requires the bundle->fRoot member to be present which,
   1525   if this weren't special-cased, wouldn't be set until the entire file had been processed. */
   1526 static struct SResource *
   1527 realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status)
   1528 {
   1529    struct SResource  *member = nullptr;
   1530    struct UString    *tokenValue=nullptr;
   1531    struct UString    comment;
   1532    enum   ETokenType token;
   1533    CharString        subtag;
   1534    uint32_t          line;
   1535    UBool             readToken = false;
   1536 
   1537    /* '{' . (name resource)* '}' */
   1538 
   1539    if(isVerbose()){
   1540        printf(" parsing table %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1541    }
   1542    for (;;)
   1543    {
   1544        ustr_init(&comment);
   1545        token = getToken(state, &tokenValue, &comment, &line, status);
   1546 
   1547        if (token == TOK_CLOSE_BRACE)
   1548        {
   1549            if (!readToken && isVerbose()) {
   1550                warning(startline, "Encountered empty table");
   1551            }
   1552            return table;
   1553        }
   1554 
   1555        if (token != TOK_STRING)
   1556        {
   1557            *status = U_INVALID_FORMAT_ERROR;
   1558 
   1559            if (token == TOK_EOF)
   1560            {
   1561                error(startline, "unterminated table");
   1562            }
   1563            else
   1564            {
   1565                error(line, "unexpected token %s", tokenNames[token]);
   1566            }
   1567 
   1568            return nullptr;
   1569        }
   1570 
   1571        if(uprv_isInvariantUString(tokenValue->fChars, -1)) {
   1572            subtag.clear();
   1573            subtag.appendInvariantChars(tokenValue->fChars, u_strlen(tokenValue->fChars), *status);
   1574        } else {
   1575            *status = U_INVALID_FORMAT_ERROR;
   1576            error(line, "invariant characters required for table keys");
   1577            return nullptr;
   1578        }
   1579 
   1580        if (U_FAILURE(*status))
   1581        {
   1582            error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status));
   1583            return nullptr;
   1584        }
   1585 
   1586        member = parseResource(state, subtag.data(), &comment, status);
   1587 
   1588        if (member == nullptr || U_FAILURE(*status))
   1589        {
   1590            error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status));
   1591            return nullptr;
   1592        }
   1593 
   1594        table->add(member, line, *status);
   1595 
   1596        if (U_FAILURE(*status))
   1597        {
   1598            error(line, "parse error. Stopped parsing table with %s", u_errorName(*status));
   1599            return nullptr;
   1600        }
   1601        readToken = true;
   1602        ustr_deinit(&comment);
   1603   }
   1604 
   1605    /* not reached */
   1606    /* A compiler warning will appear if all paths don't contain a return statement. */
   1607 /*     *status = U_INTERNAL_PROGRAM_ERROR;
   1608     return nullptr;*/
   1609 }
   1610 
   1611 static struct SResource *
   1612 parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
   1613 {
   1614    if (tag != nullptr && uprv_strcmp(tag, "CollationElements") == 0)
   1615    {
   1616        return parseCollationElements(state, tag, startline, false, status);
   1617    }
   1618    if (tag != nullptr && uprv_strcmp(tag, "collations") == 0)
   1619    {
   1620        return parseCollationElements(state, tag, startline, true, status);
   1621    }
   1622    if(isVerbose()){
   1623        printf(" table %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1624    }
   1625 
   1626    TableResource *result = table_open(state->bundle, tag, comment, status);
   1627 
   1628    if (result == nullptr || U_FAILURE(*status))
   1629    {
   1630        return nullptr;
   1631    }
   1632    return realParseTable(state, result, tag, startline,  status);
   1633 }
   1634 
   1635 static struct SResource *
   1636 parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
   1637 {
   1638    struct SResource  *member = nullptr;
   1639    struct UString    *tokenValue;
   1640    struct UString    memberComments;
   1641    enum   ETokenType token;
   1642    UBool             readToken = false;
   1643 
   1644    ArrayResource  *result = array_open(state->bundle, tag, comment, status);
   1645 
   1646    if (result == nullptr || U_FAILURE(*status))
   1647    {
   1648        return nullptr;
   1649    }
   1650    if(isVerbose()){
   1651        printf(" array %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1652    }
   1653 
   1654    ustr_init(&memberComments);
   1655 
   1656    /* '{' . resource [','] '}' */
   1657    for (;;)
   1658    {
   1659        /* reset length */
   1660        ustr_setlen(&memberComments, 0, status);
   1661 
   1662        /* check for end of array, but don't consume next token unless it really is the end */
   1663        token = peekToken(state, 0, &tokenValue, nullptr, &memberComments, status);
   1664 
   1665 
   1666        if (token == TOK_CLOSE_BRACE)
   1667        {
   1668            getToken(state, nullptr, nullptr, nullptr, status);
   1669            if (!readToken) {
   1670                warning(startline, "Encountered empty array");
   1671            }
   1672            break;
   1673        }
   1674 
   1675        if (token == TOK_EOF)
   1676        {
   1677            res_close(result);
   1678            *status = U_INVALID_FORMAT_ERROR;
   1679            error(startline, "unterminated array");
   1680            return nullptr;
   1681        }
   1682 
   1683        /* string arrays are a special case */
   1684        if (token == TOK_STRING)
   1685        {
   1686            getToken(state, &tokenValue, &memberComments, nullptr, status);
   1687            member = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, &memberComments, status);
   1688        }
   1689        else
   1690        {
   1691            member = parseResource(state, nullptr, &memberComments, status);
   1692        }
   1693 
   1694        if (member == nullptr || U_FAILURE(*status))
   1695        {
   1696            res_close(result);
   1697            return nullptr;
   1698        }
   1699 
   1700        result->add(member);
   1701 
   1702        /* eat optional comma if present */
   1703        token = peekToken(state, 0, nullptr, nullptr, nullptr, status);
   1704 
   1705        if (token == TOK_COMMA)
   1706        {
   1707            getToken(state, nullptr, nullptr, nullptr, status);
   1708        }
   1709 
   1710        if (U_FAILURE(*status))
   1711        {
   1712            res_close(result);
   1713            return nullptr;
   1714        }
   1715        readToken = true;
   1716    }
   1717 
   1718    ustr_deinit(&memberComments);
   1719    return result;
   1720 }
   1721 
   1722 static struct SResource *
   1723 parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
   1724 {
   1725    enum   ETokenType  token;
   1726    char              *string;
   1727    int32_t            value;
   1728    UBool              readToken = false;
   1729    char              *stopstring;
   1730    struct UString     memberComments;
   1731 
   1732    IntVectorResource *result = intvector_open(state->bundle, tag, comment, status);
   1733 
   1734    if (result == nullptr || U_FAILURE(*status))
   1735    {
   1736        return nullptr;
   1737    }
   1738 
   1739    if(isVerbose()){
   1740        printf(" vector %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1741    }
   1742    ustr_init(&memberComments);
   1743    /* '{' . string [','] '}' */
   1744    for (;;)
   1745    {
   1746        ustr_setlen(&memberComments, 0, status);
   1747 
   1748        /* check for end of array, but don't consume next token unless it really is the end */
   1749        token = peekToken(state, 0, nullptr, nullptr,&memberComments, status);
   1750 
   1751        if (token == TOK_CLOSE_BRACE)
   1752        {
   1753            /* it's the end, consume the close brace */
   1754            getToken(state, nullptr, nullptr, nullptr, status);
   1755            if (!readToken) {
   1756                warning(startline, "Encountered empty int vector");
   1757            }
   1758            ustr_deinit(&memberComments);
   1759            return result;
   1760        }
   1761 
   1762        int32_t stringLength;
   1763        string = getInvariantString(state, nullptr, nullptr, stringLength, status);
   1764 
   1765        if (U_FAILURE(*status))
   1766        {
   1767            res_close(result);
   1768            return nullptr;
   1769        }
   1770 
   1771        /* For handling illegal char in the Intvector */
   1772        value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/
   1773        int32_t len = static_cast<int32_t>(stopstring - string);
   1774 
   1775        if(len==stringLength)
   1776        {
   1777            result->add(value, *status);
   1778            uprv_free(string);
   1779            token = peekToken(state, 0, nullptr, nullptr, nullptr, status);
   1780        }
   1781        else
   1782        {
   1783            uprv_free(string);
   1784            *status=U_INVALID_CHAR_FOUND;
   1785        }
   1786 
   1787        if (U_FAILURE(*status))
   1788        {
   1789            res_close(result);
   1790            return nullptr;
   1791        }
   1792 
   1793        /* the comma is optional (even though it is required to prevent the reader from concatenating
   1794        consecutive entries) so that a missing comma on the last entry isn't an error */
   1795        if (token == TOK_COMMA)
   1796        {
   1797            getToken(state, nullptr, nullptr, nullptr, status);
   1798        }
   1799        readToken = true;
   1800    }
   1801 
   1802    /* not reached */
   1803    /* A compiler warning will appear if all paths don't contain a return statement. */
   1804 /*    intvector_close(result, status);
   1805    *status = U_INTERNAL_PROGRAM_ERROR;
   1806    return nullptr;*/
   1807 }
   1808 
   1809 static struct SResource *
   1810 parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
   1811 {
   1812    uint32_t line;
   1813    int32_t stringLength;
   1814    LocalMemory<char> string(getInvariantString(state, &line, nullptr, stringLength, status));
   1815    if (string.isNull() || U_FAILURE(*status))
   1816    {
   1817        return nullptr;
   1818    }
   1819 
   1820    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
   1821    if (U_FAILURE(*status))
   1822    {
   1823        return nullptr;
   1824    }
   1825 
   1826    if(isVerbose()){
   1827        printf(" binary %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1828    }
   1829 
   1830    LocalMemory<uint8_t> value;
   1831    int32_t count = 0;
   1832    if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == nullptr)
   1833    {
   1834        *status = U_MEMORY_ALLOCATION_ERROR;
   1835        return nullptr;
   1836    }
   1837 
   1838    char toConv[3] = {'\0', '\0', '\0'};
   1839    for (int32_t i = 0; i < stringLength;)
   1840    {
   1841        // Skip spaces (which may have been line endings).
   1842        char c0 = string[i++];
   1843        if (c0 == ' ') { continue; }
   1844        if (i == stringLength) {
   1845            *status=U_INVALID_CHAR_FOUND;
   1846            error(line, "Encountered invalid binary value (odd number of hex digits)");
   1847            return nullptr;
   1848        }
   1849        toConv[0] = c0;
   1850        toConv[1] = string[i++];
   1851 
   1852        char *stopstring;
   1853        value[count++] = static_cast<uint8_t>(uprv_strtoul(toConv, &stopstring, 16));
   1854        uint32_t len = static_cast<uint32_t>(stopstring - toConv);
   1855 
   1856        if(len!=2)
   1857        {
   1858            *status=U_INVALID_CHAR_FOUND;
   1859            error(line, "Encountered invalid binary value (not all pairs of hex digits)");
   1860            return nullptr;
   1861        }
   1862    }
   1863 
   1864    if (count == 0) {
   1865        warning(startline, "Encountered empty binary value");
   1866        return bin_open(state->bundle, tag, 0, nullptr, "", comment, status);
   1867    } else {
   1868        return bin_open(state->bundle, tag, count, value.getAlias(), nullptr, comment, status);
   1869    }
   1870 }
   1871 
   1872 static struct SResource *
   1873 parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
   1874 {
   1875    struct SResource *result = nullptr;
   1876    int32_t           value;
   1877    char             *string;
   1878    char             *stopstring;
   1879 
   1880    int32_t stringLength;
   1881    string = getInvariantString(state, nullptr, nullptr, stringLength, status);
   1882 
   1883    if (string == nullptr || U_FAILURE(*status))
   1884    {
   1885        return nullptr;
   1886    }
   1887 
   1888    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
   1889 
   1890    if (U_FAILURE(*status))
   1891    {
   1892        uprv_free(string);
   1893        return nullptr;
   1894    }
   1895 
   1896    if(isVerbose()){
   1897        printf(" integer %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1898    }
   1899 
   1900    if (stringLength == 0)
   1901    {
   1902        warning(startline, "Encountered empty integer. Default value is 0.");
   1903    }
   1904 
   1905    /* Allow integer support for hexdecimal, octal digit and decimal*/
   1906    /* and handle illegal char in the integer*/
   1907    value = uprv_strtoul(string, &stopstring, 0);
   1908    int32_t len = static_cast<int32_t>(stopstring - string);
   1909    if(len==stringLength)
   1910    {
   1911        result = int_open(state->bundle, tag, value, comment, status);
   1912    }
   1913    else
   1914    {
   1915        *status=U_INVALID_CHAR_FOUND;
   1916    }
   1917    uprv_free(string);
   1918 
   1919    return result;
   1920 }
   1921 
   1922 static struct SResource *
   1923 parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
   1924 {
   1925    uint32_t          line;
   1926    int32_t stringLength;
   1927    LocalMemory<char> filename(getInvariantString(state, &line, nullptr, stringLength, status));
   1928    if (U_FAILURE(*status))
   1929    {
   1930        return nullptr;
   1931    }
   1932 
   1933    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
   1934 
   1935    if (U_FAILURE(*status))
   1936    {
   1937        return nullptr;
   1938    }
   1939 
   1940    if(isVerbose()){
   1941        printf(" import %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   1942    }
   1943 
   1944    /* Open the input file for reading */
   1945    CharString fullname;
   1946    if (state->inputdir != nullptr) {
   1947        fullname.append(state->inputdir, *status);
   1948    }
   1949    fullname.appendPathPart(filename.getAlias(), *status);
   1950    if (U_FAILURE(*status)) {
   1951        return nullptr;
   1952    }
   1953 
   1954    FileStream *file = T_FileStream_open(fullname.data(), "rb");
   1955    if (file == nullptr)
   1956    {
   1957        error(line, "couldn't open input file %s", filename.getAlias());
   1958        *status = U_FILE_ACCESS_ERROR;
   1959        return nullptr;
   1960    }
   1961 
   1962    int32_t len  = T_FileStream_size(file);
   1963    LocalMemory<uint8_t> data;
   1964    if(data.allocateInsteadAndCopy(len) == nullptr)
   1965    {
   1966        *status = U_MEMORY_ALLOCATION_ERROR;
   1967        T_FileStream_close (file);
   1968        return nullptr;
   1969    }
   1970 
   1971    /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len);
   1972    T_FileStream_close (file);
   1973 
   1974    return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status);
   1975 }
   1976 
   1977 static struct SResource *
   1978 parseInclude(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
   1979 {
   1980    struct SResource *result;
   1981    int32_t           len=0;
   1982    char             *filename;
   1983    uint32_t          line;
   1984    char16_t *pTarget     = nullptr;
   1985 
   1986    UCHARBUF *ucbuf;
   1987    char     *fullname = nullptr;
   1988    const char* cp = nullptr;
   1989    const char16_t* uBuffer = nullptr;
   1990 
   1991    int32_t stringLength;
   1992    filename = getInvariantString(state, &line, nullptr, stringLength, status);
   1993 
   1994    if (U_FAILURE(*status))
   1995    {
   1996        return nullptr;
   1997    }
   1998 
   1999    expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status);
   2000 
   2001    if (U_FAILURE(*status))
   2002    {
   2003        uprv_free(filename);
   2004        return nullptr;
   2005    }
   2006 
   2007    if(isVerbose()){
   2008        printf(" include %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   2009    }
   2010 
   2011    fullname = static_cast<char*>(uprv_malloc(state->inputdirLength + stringLength + 2));
   2012    /* test for nullptr */
   2013    if(fullname == nullptr)
   2014    {
   2015        *status = U_MEMORY_ALLOCATION_ERROR;
   2016        uprv_free(filename);
   2017        return nullptr;
   2018    }
   2019 
   2020    if(state->inputdir!=nullptr){
   2021        if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
   2022        {
   2023 
   2024            uprv_strcpy(fullname, state->inputdir);
   2025 
   2026            fullname[state->inputdirLength]      = U_FILE_SEP_CHAR;
   2027            fullname[state->inputdirLength + 1] = '\0';
   2028 
   2029            uprv_strcat(fullname, filename);
   2030        }
   2031        else
   2032        {
   2033            uprv_strcpy(fullname, state->inputdir);
   2034            uprv_strcat(fullname, filename);
   2035        }
   2036    }else{
   2037        uprv_strcpy(fullname,filename);
   2038    }
   2039 
   2040    ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status);
   2041 
   2042    if (U_FAILURE(*status)) {
   2043        error(line, "couldn't open input file %s\n", filename);
   2044        return nullptr;
   2045    }
   2046 
   2047    uBuffer = ucbuf_getBuffer(ucbuf,&len,status);
   2048    result = string_open(state->bundle, tag, uBuffer, len, comment, status);
   2049 
   2050    ucbuf_close(ucbuf);
   2051 
   2052    uprv_free(pTarget);
   2053 
   2054    uprv_free(filename);
   2055    uprv_free(fullname);
   2056 
   2057    return result;
   2058 }
   2059 
   2060 
   2061 
   2062 
   2063 
   2064 U_STRING_DECL(k_type_string,    "string",    6);
   2065 U_STRING_DECL(k_type_binary,    "binary",    6);
   2066 U_STRING_DECL(k_type_bin,       "bin",       3);
   2067 U_STRING_DECL(k_type_table,     "table",     5);
   2068 U_STRING_DECL(k_type_table_no_fallback,     "table(nofallback)",         17);
   2069 U_STRING_DECL(k_type_int,       "int",       3);
   2070 U_STRING_DECL(k_type_integer,   "integer",   7);
   2071 U_STRING_DECL(k_type_array,     "array",     5);
   2072 U_STRING_DECL(k_type_alias,     "alias",     5);
   2073 U_STRING_DECL(k_type_intvector, "intvector", 9);
   2074 U_STRING_DECL(k_type_import,    "import",    6);
   2075 U_STRING_DECL(k_type_include,   "include",   7);
   2076 
   2077 /* Various non-standard processing plugins that create one or more special resources. */
   2078 U_STRING_DECL(k_type_plugin_uca_rules,      "process(uca_rules)",        18);
   2079 U_STRING_DECL(k_type_plugin_collation,      "process(collation)",        18);
   2080 U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)",   23);
   2081 U_STRING_DECL(k_type_plugin_dependency,     "process(dependency)",       19);
   2082 
   2083 typedef enum EResourceType
   2084 {
   2085    RESTYPE_UNKNOWN,
   2086    RESTYPE_STRING,
   2087    RESTYPE_BINARY,
   2088    RESTYPE_TABLE,
   2089    RESTYPE_TABLE_NO_FALLBACK,
   2090    RESTYPE_INTEGER,
   2091    RESTYPE_ARRAY,
   2092    RESTYPE_ALIAS,
   2093    RESTYPE_INTVECTOR,
   2094    RESTYPE_IMPORT,
   2095    RESTYPE_INCLUDE,
   2096    RESTYPE_PROCESS_UCA_RULES,
   2097    RESTYPE_PROCESS_COLLATION,
   2098    RESTYPE_PROCESS_TRANSLITERATOR,
   2099    RESTYPE_PROCESS_DEPENDENCY,
   2100    RESTYPE_RESERVED
   2101 } EResourceType;
   2102 
   2103 static struct {
   2104    const char *nameChars;   /* only used for debugging */
   2105    const char16_t *nameUChars;
   2106    ParseResourceFunction *parseFunction;
   2107 } gResourceTypes[] = {
   2108    {"Unknown", nullptr, nullptr},
   2109    {"string", k_type_string, parseString},
   2110    {"binary", k_type_binary, parseBinary},
   2111    {"table", k_type_table, parseTable},
   2112    {"table(nofallback)", k_type_table_no_fallback, nullptr}, /* parseFunction will never be called */
   2113    {"integer", k_type_integer, parseInteger},
   2114    {"array", k_type_array, parseArray},
   2115    {"alias", k_type_alias, parseAlias},
   2116    {"intvector", k_type_intvector, parseIntVector},
   2117    {"import", k_type_import, parseImport},
   2118    {"include", k_type_include, parseInclude},
   2119    {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules},
   2120    {"process(collation)", k_type_plugin_collation, nullptr /* not implemented yet */},
   2121    {"process(transliterator)", k_type_plugin_transliterator, parseTransliterator},
   2122    {"process(dependency)", k_type_plugin_dependency, parseDependency},
   2123    {"reserved", nullptr, nullptr}
   2124 };
   2125 
   2126 void initParser()
   2127 {
   2128    U_STRING_INIT(k_type_string,    "string",    6);
   2129    U_STRING_INIT(k_type_binary,    "binary",    6);
   2130    U_STRING_INIT(k_type_bin,       "bin",       3);
   2131    U_STRING_INIT(k_type_table,     "table",     5);
   2132    U_STRING_INIT(k_type_table_no_fallback,     "table(nofallback)",         17);
   2133    U_STRING_INIT(k_type_int,       "int",       3);
   2134    U_STRING_INIT(k_type_integer,   "integer",   7);
   2135    U_STRING_INIT(k_type_array,     "array",     5);
   2136    U_STRING_INIT(k_type_alias,     "alias",     5);
   2137    U_STRING_INIT(k_type_intvector, "intvector", 9);
   2138    U_STRING_INIT(k_type_import,    "import",    6);
   2139    U_STRING_INIT(k_type_include,   "include",   7);
   2140 
   2141    U_STRING_INIT(k_type_plugin_uca_rules,      "process(uca_rules)",        18);
   2142    U_STRING_INIT(k_type_plugin_collation,      "process(collation)",        18);
   2143    U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)",   23);
   2144    U_STRING_INIT(k_type_plugin_dependency,     "process(dependency)",       19);
   2145 }
   2146 
   2147 static inline UBool isTable(enum EResourceType type) {
   2148    return type == RESTYPE_TABLE || type == RESTYPE_TABLE_NO_FALLBACK;
   2149 }
   2150 
   2151 static enum EResourceType
   2152 parseResourceType(ParseState* state, UErrorCode *status)
   2153 {
   2154    struct UString        *tokenValue;
   2155    struct UString        comment;
   2156    enum   EResourceType  result = RESTYPE_UNKNOWN;
   2157    uint32_t              line=0;
   2158    ustr_init(&comment);
   2159    expect(state, TOK_STRING, &tokenValue, &comment, &line, status);
   2160 
   2161    if (U_FAILURE(*status))
   2162    {
   2163        return RESTYPE_UNKNOWN;
   2164    }
   2165 
   2166    *status = U_ZERO_ERROR;
   2167 
   2168    /* Search for normal types */
   2169    result=RESTYPE_UNKNOWN;
   2170    while ((result = static_cast<EResourceType>(result + 1)) < RESTYPE_RESERVED) {
   2171        if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) {
   2172            break;
   2173        }
   2174    }
   2175    /* Now search for the aliases */
   2176    if (u_strcmp(tokenValue->fChars, k_type_int) == 0) {
   2177        result = RESTYPE_INTEGER;
   2178    }
   2179    else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) {
   2180        result = RESTYPE_BINARY;
   2181    }
   2182    else if (result == RESTYPE_RESERVED) {
   2183        char tokenBuffer[1024];
   2184        u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer));
   2185        tokenBuffer[sizeof(tokenBuffer) - 1] = 0;
   2186        *status = U_INVALID_FORMAT_ERROR;
   2187        error(line, "unknown resource type '%s'", tokenBuffer);
   2188    }
   2189 
   2190    return result;
   2191 }
   2192 
   2193 /* parse a non-top-level resource */
   2194 static struct SResource *
   2195 parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status)
   2196 {
   2197    enum   ETokenType      token;
   2198    enum   EResourceType  resType = RESTYPE_UNKNOWN;
   2199    ParseResourceFunction *parseFunction = nullptr;
   2200    struct UString        *tokenValue;
   2201    uint32_t                 startline;
   2202    uint32_t                 line;
   2203 
   2204 
   2205    token = getToken(state, &tokenValue, nullptr, &startline, status);
   2206 
   2207    if(isVerbose()){
   2208        printf(" resource %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
   2209    }
   2210 
   2211    /* name . [ ':' type ] '{' resource '}' */
   2212    /* This function parses from the colon onwards.  If the colon is present, parse the
   2213    type then try to parse a resource of that type.  If there is no explicit type,
   2214    work it out using the lookahead tokens. */
   2215    switch (token)
   2216    {
   2217    case TOK_EOF:
   2218        *status = U_INVALID_FORMAT_ERROR;
   2219        error(startline, "Unexpected EOF encountered");
   2220        return nullptr;
   2221 
   2222    case TOK_ERROR:
   2223        *status = U_INVALID_FORMAT_ERROR;
   2224        return nullptr;
   2225 
   2226    case TOK_COLON:
   2227        resType = parseResourceType(state, status);
   2228        expect(state, TOK_OPEN_BRACE, &tokenValue, nullptr, &startline, status);
   2229 
   2230        if (U_FAILURE(*status))
   2231        {
   2232            return nullptr;
   2233        }
   2234 
   2235        break;
   2236 
   2237    case TOK_OPEN_BRACE:
   2238        break;
   2239 
   2240    default:
   2241        *status = U_INVALID_FORMAT_ERROR;
   2242        error(startline, "syntax error while reading a resource, expected '{' or ':'");
   2243        return nullptr;
   2244    }
   2245 
   2246 
   2247    if (resType == RESTYPE_UNKNOWN)
   2248    {
   2249        /* No explicit type, so try to work it out.  At this point, we've read the first '{'.
   2250        We could have any of the following:
   2251        { {         => array (nested)
   2252        { :/}       => array
   2253        { string ,  => string array
   2254 
   2255        { string {  => table
   2256 
   2257        { string :/{    => table
   2258        { string }      => string
   2259        */
   2260 
   2261        token = peekToken(state, 0, nullptr, &line, nullptr,status);
   2262 
   2263        if (U_FAILURE(*status))
   2264        {
   2265            return nullptr;
   2266        }
   2267 
   2268        if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE )
   2269        {
   2270            resType = RESTYPE_ARRAY;
   2271        }
   2272        else if (token == TOK_STRING)
   2273        {
   2274            token = peekToken(state, 1, nullptr, &line, nullptr, status);
   2275 
   2276            if (U_FAILURE(*status))
   2277            {
   2278                return nullptr;
   2279            }
   2280 
   2281            switch (token)
   2282            {
   2283            case TOK_COMMA:         resType = RESTYPE_ARRAY;  break;
   2284            case TOK_OPEN_BRACE:    resType = RESTYPE_TABLE;  break;
   2285            case TOK_CLOSE_BRACE:   resType = RESTYPE_STRING; break;
   2286            case TOK_COLON:         resType = RESTYPE_TABLE;  break;
   2287            default:
   2288                *status = U_INVALID_FORMAT_ERROR;
   2289                error(line, "Unexpected token after string, expected ',', '{' or '}'");
   2290                return nullptr;
   2291            }
   2292        }
   2293        else
   2294        {
   2295            *status = U_INVALID_FORMAT_ERROR;
   2296            error(line, "Unexpected token after '{'");
   2297            return nullptr;
   2298        }
   2299 
   2300        /* printf("Type guessed as %s\n", resourceNames[resType]); */
   2301    } else if(resType == RESTYPE_TABLE_NO_FALLBACK) {
   2302        *status = U_INVALID_FORMAT_ERROR;
   2303        error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars);
   2304        return nullptr;
   2305    }
   2306 
   2307 
   2308    /* We should now know what we need to parse next, so call the appropriate parser
   2309    function and return. */
   2310    parseFunction = gResourceTypes[resType].parseFunction;
   2311    if (parseFunction != nullptr) {
   2312        return parseFunction(state, tag, startline, comment, status);
   2313    }
   2314    else {
   2315        *status = U_INTERNAL_PROGRAM_ERROR;
   2316        error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars);
   2317    }
   2318 
   2319    return nullptr;
   2320 }
   2321 
   2322 /* parse the top-level resource */
   2323 struct SRBRoot *
   2324 parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
   2325      UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
   2326 {
   2327    struct UString    *tokenValue;
   2328    struct UString    comment;
   2329    uint32_t           line;
   2330    enum EResourceType bundleType;
   2331    enum ETokenType    token;
   2332    ParseState state;
   2333    uint32_t i;
   2334 
   2335 
   2336    for (i = 0; i < MAX_LOOKAHEAD + 1; i++)
   2337    {
   2338        ustr_init(&state.lookahead[i].value);
   2339        ustr_init(&state.lookahead[i].comment);
   2340    }
   2341 
   2342    initLookahead(&state, buf, status);
   2343 
   2344    state.inputdir       = inputDir;
   2345    state.inputdirLength = state.inputdir != nullptr ? static_cast<uint32_t>(uprv_strlen(state.inputdir)) : 0;
   2346    state.outputdir       = outputDir;
   2347    state.outputdirLength = state.outputdir != nullptr ? static_cast<uint32_t>(uprv_strlen(state.outputdir)) : 0;
   2348    state.filename = filename;
   2349    state.makeBinaryCollation = makeBinaryCollation;
   2350    state.omitCollationRules = omitCollationRules;
   2351    state.icu4xMode = icu4xMode;
   2352 
   2353    ustr_init(&comment);
   2354    expect(&state, TOK_STRING, &tokenValue, &comment, nullptr, status);
   2355 
   2356    state.bundle = new SRBRoot(&comment, false, *status);
   2357 
   2358    if (state.bundle == nullptr || U_FAILURE(*status))
   2359    {
   2360        delete state.bundle;
   2361 
   2362        return nullptr;
   2363    }
   2364 
   2365 
   2366    state.bundle->setLocale(tokenValue->fChars, *status);
   2367 
   2368    /* The following code is to make Empty bundle work no matter with :table specifer or not */
   2369    token = getToken(&state, nullptr, nullptr, &line, status);
   2370    if(token==TOK_COLON) {
   2371        *status=U_ZERO_ERROR;
   2372        bundleType=parseResourceType(&state, status);
   2373 
   2374        if(isTable(bundleType))
   2375        {
   2376            expect(&state, TOK_OPEN_BRACE, nullptr, nullptr, &line, status);
   2377        }
   2378        else
   2379        {
   2380            *status=U_PARSE_ERROR;
   2381             error(line, "parse error. Stopped parsing with %s", u_errorName(*status));
   2382        }
   2383    }
   2384    else
   2385    {
   2386        /* not a colon */
   2387        if(token==TOK_OPEN_BRACE)
   2388        {
   2389            *status=U_ZERO_ERROR;
   2390            bundleType=RESTYPE_TABLE;
   2391        }
   2392        else
   2393        {
   2394            /* neither colon nor open brace */
   2395            *status=U_PARSE_ERROR;
   2396            bundleType=RESTYPE_UNKNOWN;
   2397            error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status));
   2398        }
   2399    }
   2400 
   2401    if (U_FAILURE(*status))
   2402    {
   2403        delete state.bundle;
   2404        return nullptr;
   2405    }
   2406 
   2407    if(bundleType==RESTYPE_TABLE_NO_FALLBACK) {
   2408        /*
   2409         * Parse a top-level table with the table(nofallback) declaration.
   2410         * This is the same as a regular table, but also sets the
   2411         * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] .
   2412         */
   2413        state.bundle->fNoFallback=true;
   2414    }
   2415    /* top-level tables need not handle special table names like "collations" */
   2416    assert(!state.bundle->fIsPoolBundle);
   2417    assert(state.bundle->fRoot->fType == URES_TABLE);
   2418    TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot);
   2419    realParseTable(&state, rootTable, nullptr, line, status);
   2420    if(dependencyArray!=nullptr){
   2421        rootTable->add(dependencyArray, 0, *status);
   2422        dependencyArray = nullptr;
   2423    }
   2424   if (U_FAILURE(*status))
   2425    {
   2426        delete state.bundle;
   2427        res_close(dependencyArray);
   2428        return nullptr;
   2429    }
   2430 
   2431    if (getToken(&state, nullptr, nullptr, &line, status) != TOK_EOF)
   2432    {
   2433        warning(line, "extraneous text after resource bundle (perhaps unmatched braces)");
   2434        if(isStrict()){
   2435            *status = U_INVALID_FORMAT_ERROR;
   2436            return nullptr;
   2437        }
   2438    }
   2439 
   2440    cleanupLookahead(&state);
   2441    ustr_deinit(&comment);
   2442    return state.bundle;
   2443 }