tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

messageformat2_parser.cpp (66568B)


      1 // © 2024 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 
      6 #if !UCONFIG_NO_NORMALIZATION
      7 
      8 #if !UCONFIG_NO_FORMATTING
      9 
     10 #if !UCONFIG_NO_MF2
     11 
     12 #include "unicode/uniset.h"
     13 #include "messageformat2_errors.h"
     14 #include "messageformat2_macros.h"
     15 #include "messageformat2_parser.h"
     16 #include "ucln_in.h"
     17 #include "umutex.h"
     18 #include "uvector.h" // U_ASSERT
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 namespace message2 {
     23 
     24 using namespace pluralimpl;
     25 
     26 using namespace data_model;
     27 
     /*
        `ERROR_AT()` records a syntax error in `errors` and sets the offset in
        `parseError` to position `i`, unless a syntax error was already recorded.
        `ERROR()` is the same thing with the current `index` as the position.
        Neither macro alters control flow.
     */
     #define ERROR_AT(errorCode, i)              \
         if (!errors.hasSyntaxError()) {         \
             setParseError(parseError, i);       \
             errors.addSyntaxError(errorCode);   \
         }

     #define ERROR(errorCode) ERROR_AT(errorCode, index)
     43 
     44 // Increments the line number and updates the "characters seen before
     45 // current line" count in `parseError`, iff `peek()` is a newline
     46 void Parser::maybeAdvanceLine() {
     47    if (peek() == LF) {
     48        parseError.line++;
     49        // add 1 to index to get the number of characters seen so far
     50        // (including the newline)
     51        parseError.lengthBeforeCurrentLine = index + 1;
     52    }
     53 }
     54 
     /*
        `CHECK_BOUNDS()` signals a syntax error at `index` and returns from the
        enclosing function when `inBounds()` is false, i.e. when `index` is out
        of bounds for the string `source`.
        `CHECK_BOUNDS_1()` tests `inBounds(1)` instead and reports the error at
        `index + 1`.
        Both use a bare `return;`, so they only fit functions returning void.
     */
     #define CHECK_BOUNDS(errorCode) \
         if (!inBounds()) {          \
             ERROR(errorCode);       \
             return;                 \
         }
     #define CHECK_BOUNDS_1(errorCode)       \
         if (!inBounds(1)) {                 \
             ERROR_AT(errorCode, index + 1); \
             return;                         \
         }
     69 
     70 // -------------------------------------
     71 // Helper functions
     72 
     73 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
     74    for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) {
     75        out[i] = in[i];
     76        if (in[i] == '\0') {
     77            break;
     78        }
     79    }
     80 }
     81 
     82 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
     83    parseError.line = messageParseError.line;
     84    parseError.offset = messageParseError.offset;
     85    copyContext(messageParseError.preContext, parseError.preContext);
     86    copyContext(messageParseError.postContext, parseError.postContext);
     87 }
     88 
     89 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
     90    // Translate absolute to relative offset
     91    parseError.offset = index                               // Start with total number of characters seen
     92                      - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line
     93    // TODO: Fill this in with actual pre and post-context
     94    parseError.preContext[0] = 0;
     95    parseError.postContext[0] = 0;
     96 }
     97 
     98 // -------------------------------------
     99 // Initialization of UnicodeSets
    100 
    101 namespace unisets {
    102 
    103 UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {};
    104 
    105 inline UnicodeSet* getImpl(Key key) {
    106    return gUnicodeSets[key];
    107 }
    108 
    109 icu::UInitOnce gMF2ParseUniSetsInitOnce {};
    110 }
    111 
    112 UnicodeSet* initContentChars(UErrorCode& status) {
    113    if (U_FAILURE(status)) {
    114        return nullptr;
    115    }
    116 
    117    UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF
    118    if (result == nullptr) {
    119        status = U_MEMORY_ALLOCATION_ERROR;
    120        return nullptr;
    121    }
    122    result->add(0x000B, 0x000C); // Omit CR
    123    result->add(0x000E, 0x001F); // Omit SP
    124    result->add(0x0021, 0x002D); // Omit '.'
    125    result->add(0x002F, 0x003F); // Omit '@'
    126    result->add(0x0041, 0x005B); // Omit '\'
    127    result->add(0x005D, 0x007A); // Omit { | }
    128    result->add(0x007E, 0x2FFF); // Omit IDEOGRAPHIC_SPACE
    129    result->add(0x3001, 0x10FFFF); // Allowing surrogates is intentional
    130    result->freeze();
    131    return result;
    132 }
    133 
    134 UnicodeSet* initWhitespace(UErrorCode& status) {
    135    if (U_FAILURE(status)) {
    136        return nullptr;
    137    }
    138 
    139    UnicodeSet* result = new UnicodeSet();
    140    if (result == nullptr) {
    141        status = U_MEMORY_ALLOCATION_ERROR;
    142        return nullptr;
    143    }
    144    result->add(SPACE);
    145    result->add(HTAB);
    146    result->add(CR);
    147    result->add(LF);
    148    result->add(IDEOGRAPHIC_SPACE);
    149    result->freeze();
    150    return result;
    151 }
    152 
    153 UnicodeSet* initBidiControls(UErrorCode& status) {
    154    UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status);
    155    if (U_FAILURE(status)) {
    156        return nullptr;
    157    }
    158    result->add(0x200E, 0x200F);
    159    result->add(0x2066, 0x2069);
    160    result->freeze();
    161    return result;
    162 }
    163 
    164 UnicodeSet* initAlpha(UErrorCode& status) {
    165    UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status);
    166    if (U_FAILURE(status)) {
    167        return nullptr;
    168    }
    169    result->freeze();
    170    return result;
    171 }
    172 
    173 UnicodeSet* initDigits(UErrorCode& status) {
    174    UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status);
    175    if (U_FAILURE(status)) {
    176        return nullptr;
    177    }
    178    result->freeze();
    179    return result;
    180 }
    181 
    182 UnicodeSet* initNameStartChars(UErrorCode& status) {
    183    if (U_FAILURE(status)) {
    184        return nullptr;
    185    }
    186 
    187    UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status);
    188    if (U_FAILURE(status)) {
    189        return nullptr;
    190    }
    191    UnicodeSet* result = new UnicodeSet();
    192    if (result == nullptr) {
    193        status = U_MEMORY_ALLOCATION_ERROR;
    194        return nullptr;
    195    };
    196 
    197    result->addAll(*isAlpha);
    198    result->add(0x002B);
    199    result->add(0x005F);
    200    result->add(0x00A1, 0x061B);
    201    result->add(0x061D, 0x167F);
    202    result->add(0x1681, 0x1FFF);
    203    result->add(0x200B, 0x200D);
    204    result->add(0x2010, 0x2027);
    205    result->add(0x2030, 0x205E);
    206    result->add(0x2060, 0x2065);
    207    result->add(0x206A, 0x2FFF);
    208    result->add(0x3001, 0xD7FF);
    209    result->add(0xE000, 0xFDCF);
    210    result->add(0xFDF0, 0xFFFD);
    211    result->add(0x10000, 0x1FFFD);
    212    result->add(0x20000, 0x2FFFD);
    213    result->add(0x30000, 0x3FFFD);
    214    result->add(0x40000, 0x4FFFD);
    215    result->add(0x50000, 0x5FFFD);
    216    result->add(0x60000, 0x6FFFD);
    217    result->add(0x70000, 0x7FFFD);
    218    result->add(0x80000, 0x8FFFD);
    219    result->add(0x90000, 0x9FFFD);
    220    result->add(0xA0000, 0xAFFFD);
    221    result->add(0xB0000, 0xBFFFD);
    222    result->add(0xC0000, 0xCFFFD);
    223    result->add(0xD0000, 0xDFFFD);
    224    result->add(0xE0000, 0xEFFFD);
    225    result->add(0xF0000, 0xFFFFD);
    226    result->add(0x100000, 0x10FFFD);
    227    result->freeze();
    228    return result;
    229 }
    230 
    231 UnicodeSet* initNameChars(UErrorCode& status) {
    232    if (U_FAILURE(status)) {
    233        return nullptr;
    234    }
    235 
    236    UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status);
    237    UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status);
    238    if (U_FAILURE(status)) {
    239        return nullptr;
    240    }
    241    UnicodeSet* result = new UnicodeSet();
    242    if (result == nullptr) {
    243        status = U_MEMORY_ALLOCATION_ERROR;
    244        return nullptr;
    245    };
    246    result->addAll(*nameStart);
    247    result->addAll(*digit);
    248    result->add(HYPHEN);
    249    result->add(PERIOD);
    250    result->freeze();
    251    return result;
    252 }
    253 
    254 UnicodeSet* initTextChars(UErrorCode& status) {
    255    if (U_FAILURE(status)) {
    256        return nullptr;
    257    }
    258 
    259    UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status);
    260    UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status);
    261    if (U_FAILURE(status)) {
    262        return nullptr;
    263    }
    264    UnicodeSet* result = new UnicodeSet();
    265    if (result == nullptr) {
    266        status = U_MEMORY_ALLOCATION_ERROR;
    267        return nullptr;
    268    };
    269    result->addAll(*content);
    270    result->addAll(*whitespace);
    271    result->add(PERIOD);
    272    result->add(AT);
    273    result->add(PIPE);
    274    result->freeze();
    275    return result;
    276 }
    277 
    278 UnicodeSet* initQuotedChars(UErrorCode& status) {
    279    if (U_FAILURE(status)) {
    280        return nullptr;
    281    }
    282 
    283    unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status);
    284    if (U_FAILURE(status)) {
    285        return nullptr;
    286    }
    287    UnicodeSet* result = new UnicodeSet();
    288    if (result == nullptr) {
    289        status = U_MEMORY_ALLOCATION_ERROR;
    290        return nullptr;
    291    };
    292    // content and whitespace were initialized by `initTextChars()`
    293    UnicodeSet* content = unisets::getImpl(unisets::CONTENT);
    294    if (content == nullptr) {
    295        status = U_MEMORY_ALLOCATION_ERROR;
    296        return nullptr;
    297    }
    298    result->addAll(*content);
    299    UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE);
    300    if (whitespace == nullptr) {
    301        status = U_MEMORY_ALLOCATION_ERROR;
    302        return nullptr;
    303    }
    304    result->addAll(*whitespace);
    305    result->add(PERIOD);
    306    result->add(AT);
    307    result->add(LEFT_CURLY_BRACE);
    308    result->add(RIGHT_CURLY_BRACE);
    309    result->freeze();
    310    return result;
    311 }
    312 
    313 UnicodeSet* initEscapableChars(UErrorCode& status) {
    314    if (U_FAILURE(status)) {
    315        return nullptr;
    316    }
    317 
    318    UnicodeSet* result = new UnicodeSet();
    319    if (result == nullptr) {
    320        status = U_MEMORY_ALLOCATION_ERROR;
    321        return nullptr;
    322    }
    323    result->add(PIPE);
    324    result->add(BACKSLASH);
    325    result->add(LEFT_CURLY_BRACE);
    326    result->add(RIGHT_CURLY_BRACE);
    327    result->freeze();
    328    return result;
    329 }
    330 
    331 namespace unisets {
    332 
    333 UBool U_CALLCONV cleanupMF2ParseUniSets() {
    334    for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
    335        delete gUnicodeSets[i];
    336        gUnicodeSets[i] = nullptr;
    337    }
    338    gMF2ParseUniSetsInitOnce.reset();
    339    return true;
    340 }
    341 
    342 void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) {
    343    ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets);
    344    /*
    345      Each of the init functions initializes the UnicodeSets
    346      that it depends on.
    347 
    348      initBidiControls (no dependencies)
    349 
    350      initEscapableChars (no dependencies)
    351 
    352      initNameChars depends on
    353         initDigits
    354         initNameStartChars depends on
    355           initAlpha
    356 
    357      initQuotedChars depends on
    358         initTextChars depends on
    359            initContentChars
    360            initWhitespace
    361     */
    362    gUnicodeSets[unisets::BIDI] = initBidiControls(status);
    363    gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status);
    364    gUnicodeSets[unisets::QUOTED] = initQuotedChars(status);
    365    gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status);
    366 
    367    if (U_FAILURE(status)) {
    368        cleanupMF2ParseUniSets();
    369    }
    370 }
    371 
    372 const UnicodeSet* get(Key key, UErrorCode& status) {
    373    umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status);
    374    if (U_FAILURE(status)) {
    375        return nullptr;
    376    }
    377    UnicodeSet* result = getImpl(key);
    378    if (result == nullptr) {
    379        status = U_MEMORY_ALLOCATION_ERROR;
    380    }
    381    return result;
    382 }
    383 
    384 }
    385 
    386 // -------------------------------------
    387 // Predicates
    388 
    389 /*
    390  The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
    391 
    392  `isContentChar()`   : `content-char`
    393  `isTextChar()`      : `text-char`
    394  `isAlpha()`         : `ALPHA`
    395  `isDigit()`         : `DIGIT`
    396  `isNameStart()`     : `name-start`
    397  `isNameChar()`      : `name-char`
    398  `isUnquotedStart()` : `unquoted-start`
    399  `isQuotedChar()`    : `quoted-char`
    400  `isWhitespace()`    : `s`
    401 */
    402 
    403 bool Parser::isContentChar(UChar32 c) const {
    404    return contentChars->contains(c);
    405 }
    406 
    407 // See `bidi` in the MF2 grammar
    408 bool Parser::isBidiControl(UChar32 c) const {
    409    return bidiControlChars->contains(c);
    410 }
    411 
    412 // See `ws` in the MessageFormat 2 grammar
    413 bool Parser::isWhitespace(UChar32 c) const {
    414    return whitespaceChars->contains(c);
    415 }
    416 
    417 bool Parser::isTextChar(UChar32 c) const {
    418    return textChars->contains(c);
    419 }
    420 
    421 bool Parser::isAlpha(UChar32 c) const {
    422    return alphaChars->contains(c);
    423 }
    424 
    425 bool Parser::isDigit(UChar32 c) const {
    426    return digitChars->contains(c);
    427 }
    428 
    429 bool Parser::isNameStart(UChar32 c) const {
    430    return nameStartChars->contains(c);
    431 }
    432 
    433 bool Parser::isNameChar(UChar32 c) const {
    434    return nameChars->contains(c);
    435 }
    436 
    437 bool Parser::isUnquotedStart(UChar32 c) const {
    438    return isNameChar(c);
    439 }
    440 
    441 bool Parser::isQuotedChar(UChar32 c) const {
    442    return quotedChars->contains(c);
    443 }
    444 
    445 bool Parser::isEscapableChar(UChar32 c) const {
    446    return escapableChars->contains(c);
    447 }
    448 
    449 // Returns true iff `c` can begin a `function` nonterminal
    450 static bool isFunctionStart(UChar32 c) {
    451    switch (c) {
    452    case COLON: {
    453        return true;
    454    }
    455    default: {
    456        return false;
    457    }
    458    }
    459 }
    460 
    461 // Returns true iff `c` can begin an `annotation` nonterminal
    462 static bool isAnnotationStart(UChar32 c) {
    463    return isFunctionStart(c);
    464 }
    465 
    466 // Returns true iff `c` can begin a `literal` nonterminal
    467 bool Parser::isLiteralStart(UChar32 c) const {
    468    return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c));
    469 }
    470 
    471 // Returns true iff `c` can begin a `key` nonterminal
    472 bool Parser::isKeyStart(UChar32 c) const {
    473    return (c == ASTERISK || isLiteralStart(c));
    474 }
    475 
    476 bool Parser::isDeclarationStart() {
    477    return (peek() == ID_LOCAL[0]
    478            && inBounds(1)
    479            && peek(1) == ID_LOCAL[1])
    480        || (peek() == ID_INPUT[0]
    481            && inBounds(1)
    482            && peek(1) == ID_INPUT[1]);
    483 }
    484 
    485 // -------------------------------------
    486 // Parsing functions
    487 
    488 
    489 /*
    490  TODO: Since handling the whitespace ambiguities needs to be repeated
    491  in several different places and is hard to factor out,
    492  it probably would be better to replace the parser with a lexer + parser
    493  to separate tokenizing from parsing, which would simplify the code significantly.
    494  This has the disadvantage that there is no token grammar for MessageFormat,
    495  so one would have to be invented that isn't a component of the spec.
    496 */
    497 
    498 /*
    499    This is a recursive-descent scannerless parser that,
    500    with a few exceptions, uses 1 character of lookahead.
    501 
    502    This may not be an exhaustive list, as the additions of attributes and reserved
    503    statements introduced several new ambiguities.
    504 
    505 All but three of the exceptions involve ambiguities about the meaning of whitespace.
    506 One ambiguity not involving whitespace is:
    507 identifier -> namespace ":" name
    508 vs.
    509 identifier -> name
    510 
    511 `namespace` and `name` can't be distinguished without arbitrary lookahead.
    512 (For how this is handled, see parseIdentifier())
    513 
    514 The second ambiguity not involving whitespace is:
    515 complex-message -> *(declaration[s]) complex-body
    516                -> declaration *(declaration[s]) complex-body
    517                -> declaration complex-body
    518                -> reserved-statement complex-body
    519                -> .foo {$x} .match // ...
    520 When processing the '.', arbitrary lookahead is required to distinguish the
    521 arbitrary-length unsupported keyword from `.match`.
    522 (For how this is handled, see parseDeclarations()).
    523 
    524 The third ambiguity not involving whitespace is:
    525 complex-message -> *(declaration [s]) complex-body
    526                -> reserved-statement *(declaration [s]) complex-body
    527                -> reserved-statement complex-body
    528                -> reserved-statement quotedPattern
    529                -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
    530                -> reserved-keyword expression quoted-pattern
    531 Example: .foo {1} {{1}}
    532 
    533 Without lookahead, the opening '{' of the quoted pattern can't be distinguished
    534 from the opening '{' of another expression in the unsupported statement.
    535 (Though this only requires 1 character of lookahead.)
    536 
    537 Otherwise:
    538 
    539 There are at least seven ambiguities in the grammar that can't be resolved with finite
    540 lookahead (since whitespace sequences can be arbitrarily long). They are resolved
    541 with a form of backtracking (early exit). No state needs to be saved/restored
    542 since whitespace doesn't affect the shape of the resulting parse tree, so it's
    543 not true backtracking.
    544 
    545 In addition, the grammar has been refactored
    546 in a semantics-preserving way in some cases to make the code easier to structure.
    547 
    548 First: variant = when 1*(s key) [s] pattern
    549   Example: when k     {a}
    550   When reading the first space after 'k', it's ambiguous whether it's the
    551   required space before another key, or the optional space before `pattern`.
    552 (See comments in parseNonEmptyKeys())
    553 
    554 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
    555        annotation = (function *(s option)) / reserved
    556   Example: {:f    }
    557   When reading the first space after 'f', it's ambiguous whether it's the
    558   required space before an option, or the optional trailing space after an options list
    559   (in this case, the options list is empty).
    560 (See comments in parseOptions() -- handling this case also meant it was easier to base
    561  the code on a slightly refactored grammar, which should be semantically equivalent.)
    562 
    563 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
    564        annotation = (function *(s option)) / reserved
    565   Example: {@a }
    566   Similar to the previous case; see comments in parseReserved()
    567 
    568 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
    569   Example: {|foo|   }
    570   When reading the first space after the '|', it's ambiguous whether it's the required
    571   space before an annotation, or the optional trailing space before the '}'.
    572  (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
    573  the same grammar refactoring as the second exception.)
    574 
    575    Most functions match a non-terminal in the grammar, except as explained
    576    in comments.
    577 
    578 Fifth: matcher = match-statement 1*([s] variant)
    579               -> match 1 *([s] selector) 1*([s] variant)
    580    Example: match {42} * {{_}}
    581 When reading the space after the first '}', it's unclear whether
    582 it's the optional space before another selector, or the optional space
    583 before a variant.
    584 
    585 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
    586       -> "{" [s] function *(s attribute) [s] "}"
    587       -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
    588       -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
    589 
    590     Example: {:func @foo}
    591 (Note: the same ambiguity is present with variable-expression and literal-expression)
    592 
    593 Seventh:
    594 
    595 
    596 When parsing the space, it's unclear whether it's the optional space before an
    597 option, or the optional space before an attribute.
    598 
    599 Unless otherwise noted in a comment, all helper functions that take
    600    a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
    601    have the precondition:
    602      `index` < `len()`
    603    and the postcondition:
         `U_FAILURE(errorCode)` || `index` < `len()`
    605 */
    606 
    607 /*
    608  No pre, no post.
    609  A message may end with whitespace, so `index` may equal `len()` on exit.
    610 */
    611 void Parser::parseRequiredWS(UErrorCode& errorCode) {
    612    bool sawWhitespace = false;
    613 
    614    // The loop exits either when we consume all the input,
    615    // or when we see a non-whitespace character.
    616    while (true) {
    617        // Check if all input has been consumed
    618        if (!inBounds()) {
    619            // If whitespace isn't required -- or if we saw it already --
    620            // then the caller is responsible for checking this case and
    621            // setting an error if necessary.
    622            if (sawWhitespace) {
    623                // Not an error.
    624                return;
    625            }
    626            // Otherwise, whitespace is required; the end of the input has
    627            // been reached without whitespace. This is an error.
    628            ERROR(errorCode);
    629            return;
    630        }
    631 
    632        // Input remains; process the next character if it's whitespace,
    633        // exit the loop otherwise
    634        if (isWhitespace(peek())) {
    635            sawWhitespace = true;
    636            // Increment line number in parse error if we consume a newline
    637            maybeAdvanceLine();
    638            next();
    639        } else {
    640            break;
    641        }
    642    }
    643 
    644    if (!sawWhitespace) {
    645        ERROR(errorCode);
    646    }
    647 }
    648 
    649 void Parser::parseOptionalBidi() {
    650    while (true) {
    651        if (!inBounds()) {
    652            return;
    653        }
    654        if (isBidiControl(peek())) {
    655            next();
    656        } else {
    657            break;
    658        }
    659    }
    660 }
    661 
    662 /*
    663  No pre, no post, because a message may end with whitespace
    664  Matches `s` in the MF2 grammar
    665 */
    666 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
    667    parseOptionalBidi();
    668    parseRequiredWS(errorCode);
    669    parseOptionalWhitespace();
    670    normalizedInput += SPACE;
    671 }
    672 
    673 /*
    674  No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
    675 */
    676 void Parser::parseOptionalWhitespace() {
    677    while (true) {
    678        if (!inBounds()) {
    679            return;
    680        }
    681        auto cp = peek();
    682        if (isWhitespace(cp) || isBidiControl(cp)) {
    683            maybeAdvanceLine();
    684            next();
    685        } else {
    686            break;
    687        }
    688    }
    689 }
    690 
    691 // Consumes a single character, signaling an error if `peek()` != `c`
    692 // No postcondition -- a message can end with a '}' token
    693 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
    694    CHECK_BOUNDS(errorCode);
    695 
    696    if (peek() == c) {
    697        next();
    698        normalizedInput += c;
    699        return;
    700    }
    701    // Next character didn't match -- error out
    702    ERROR(errorCode);
    703 }
    704 
    705 /*
    706   Consumes a fixed-length token, signaling an error if the token isn't a prefix of
    707   the string beginning at `peek()`
    708   No postcondition -- a message can end with a '}' token
    709 */
    710 void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) {
    711    U_ASSERT(inBounds());
    712 
    713    int32_t tokenPos = 0;
    714    while (tokenPos < static_cast<int32_t>(token.length())) {
    715        if (peek() != token[tokenPos]) {
    716            ERROR(errorCode);
    717            return;
    718        }
    719        normalizedInput += token[tokenPos];
    720        next();
    721        tokenPos++;
    722    }
    723 }
    724 
    725 /*
    726   Consumes optional whitespace, possibly advancing `index` to `index'`,
    727   then consumes a fixed-length token (signaling an error if the token isn't a prefix of
    728   the string beginning at `source[index']`),
    729   then consumes optional whitespace again
    730 */
    731 void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
    732    // No need for error check or bounds check before parseOptionalWhitespace
    733    parseOptionalWhitespace();
    734    // Establish precondition
    735    CHECK_BOUNDS(errorCode);
    736    parseToken(token, errorCode);
    737    parseOptionalWhitespace();
    738    // Guarantee postcondition
    739    CHECK_BOUNDS(errorCode);
    740 }
    741 
    742 /*
    743   Consumes optional whitespace, possibly advancing `index` to `index'`,
    744   then consumes a single character (signaling an error if it doesn't match
    745   `source[index']`),
    746   then consumes optional whitespace again
    747 */
    748 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
    749    // No need for error check or bounds check before parseOptionalWhitespace()
    750    parseOptionalWhitespace();
    751    // Establish precondition
    752    CHECK_BOUNDS(errorCode);
    753    parseToken(c, errorCode);
    754    parseOptionalWhitespace();
    755    // Guarantee postcondition
    756    CHECK_BOUNDS(errorCode);
    757 }
    758 
    759 /*
    760  Consumes a possibly-empty sequence of name-chars. Appends to `str`
    761  and returns `str`.
    762 */
    763 UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) {
    764    if (U_FAILURE(errorCode)) {
    765        return {};
    766    }
    767 
    768    while (isNameChar(peek())) {
    769        UChar32 c = peek();
    770        str += c;
    771        normalizedInput += c;
    772        next();
    773        if (!inBounds()) {
    774            ERROR(errorCode);
    775            break;
    776        }
    777    }
    778 
    779    return str;
    780 }
    781 
    782 /*
    783  Consumes a non-empty sequence of `name-char`s, the first of which is
    784  also a `name-start`.
    785  that begins with a character `start` such that `isNameStart(start)`.
    786 
    787  Returns this sequence.
    788 
    789  (Matches the `name` nonterminal in the grammar.)
    790 */
    791 UnicodeString Parser::parseName(UErrorCode& errorCode) {
    792    UnicodeString name;
    793 
    794    U_ASSERT(inBounds());
    795 
    796    if (!(isNameStart(peek()) || isBidiControl(peek()))) {
    797        ERROR(errorCode);
    798        return name;
    799    }
    800 
    801    // name       = [bidi] name-start *name-char [bidi]
    802 
    803    // [bidi]
    804    parseOptionalBidi();
    805 
    806    // name-start *name-char
    807    parseNameChars(name, errorCode);
    808 
    809    // [bidi]
    810    parseOptionalBidi();
    811 
    812    return name;
    813 }
    814 
    815 /*
    816  Consumes a '$' followed by a `name`, returning a VariableName
    817  with `name` as its name
    818 
    819  (Matches the `variable` nonterminal in the grammar.)
    820 */
    821 VariableName Parser::parseVariableName(UErrorCode& errorCode) {
    822    VariableName result;
    823 
    824    U_ASSERT(inBounds());
    825 
    826    parseToken(DOLLAR, errorCode);
    827    if (!inBounds()) {
    828        ERROR(errorCode);
    829        return result;
    830    }
    831    return VariableName(parseName(errorCode));
    832 }
    833 
    834 /*
    835  Corresponds to the `identifier` nonterminal in the grammar
    836 */
    837 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
    838    U_ASSERT(inBounds());
    839 
    840    UnicodeString result;
    841    // The following is a hack to get around ambiguity in the grammar:
    842    // identifier -> namespace ":" name
    843    // vs.
    844    // identifier -> name
    845    // can't be distinguished without arbitrary lookahead.
    846    // Instead, we treat the production as:
    847    // identifier -> namespace *(":"name)
    848    // and then check for multiple colons.
    849 
    850    // Parse namespace
    851    result += parseName(errorCode);
    852    int32_t firstColon = -1;
    853    while (inBounds() && peek() == COLON) {
    854        // Parse ':' separator
    855        if (firstColon == -1) {
    856            firstColon = index;
    857        }
    858        parseToken(COLON, errorCode);
    859        result += COLON;
    860        // Check for message ending with something like "foo:"
    861        if (!inBounds()) {
    862            ERROR(errorCode);
    863        } else {
    864            // Parse name part
    865            result += parseName(errorCode);
    866        }
    867    }
    868 
    869    // If there's at least one ':', scan from the first ':'
    870    // to the end of the name to check for multiple ':'s
    871    if (firstColon != -1) {
    872        for (int32_t i = firstColon + 1; i < result.length(); i++) {
    873            if (result[i] == COLON) {
    874                ERROR_AT(errorCode, i);
    875                return {};
    876            }
    877        }
    878    }
    879 
    880    return result;
    881 }
    882 
    883 /*
    884  Consumes a reference to a function, matching the ": identifier"
    885  in the `function` nonterminal in the grammar.
    886 
    887  Returns the function name.
    888 */
    889 FunctionName Parser::parseFunction(UErrorCode& errorCode) {
    890    U_ASSERT(inBounds());
    891    if (!isFunctionStart(peek())) {
    892        ERROR(errorCode);
    893        return FunctionName();
    894    }
    895 
    896    normalizedInput += peek();
    897    next(); // Consume the function start character
    898    if (!inBounds()) {
    899        ERROR(errorCode);
    900        return FunctionName();
    901    }
    902    return parseIdentifier(errorCode);
    903 }
    904 
    905 
    906 /*
    907  Precondition: peek() == BACKSLASH
    908 
    909  Consume an escaped character.
    910  Corresponds to `escaped-char` in the grammar.
    911 
    912  No postcondition (a message can end with an escaped char)
    913 */
    914 UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) {
    915    U_ASSERT(inBounds());
    916    U_ASSERT(peek() == BACKSLASH);
    917    normalizedInput += BACKSLASH;
    918    next(); // Skip the initial backslash
    919    UnicodeString str;
    920    if (inBounds()) {
    921        // Expect a '{', '|' or '}'
    922        switch (peek()) {
    923        case LEFT_CURLY_BRACE:
    924        case RIGHT_CURLY_BRACE:
    925        case PIPE:
    926        case BACKSLASH: {
    927            /* Append to the output string */
    928            str += peek();
    929            /* Update normalizedInput */
    930            normalizedInput += peek();
    931            /* Consume the character */
    932            next();
    933            return str;
    934        }
    935        default: {
    936            // No other characters are allowed here
    937            break;
    938        }
    939        }
    940    }
    941   // If control reaches here, there was an error
    942   ERROR(errorCode);
    943   return str;
    944 }
    945 
    946 
    947 /*
    948  Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
    949 */
    950 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
    951    bool error = false;
    952 
    953    UnicodeString contents;
    954    if (U_SUCCESS(errorCode)) {
    955        // Parse the opening '|'
    956        parseToken(PIPE, errorCode);
    957        if (!inBounds()) {
    958            ERROR(errorCode);
    959            error = true;
    960        } else {
    961            // Parse the contents
    962            bool done = false;
    963            while (!done) {
    964                if (peek() == BACKSLASH) {
    965                    contents += parseEscapeSequence(errorCode);
    966                } else if (isQuotedChar(peek())) {
    967                    contents += peek();
    968                    // Handle cases like:
    969                    // |}{| -- we want to escape everywhere that
    970                    // can be escaped, to make round-trip checking
    971                    // easier -- so this case normalizes to
    972                    // |\}\{|
    973                    if (isEscapableChar(peek())) {
    974                        normalizedInput += BACKSLASH;
    975                    }
    976                    normalizedInput += peek();
    977                    next(); // Consume this character
    978                    maybeAdvanceLine();
    979                } else {
    980                    // Assume the sequence of literal characters ends here
    981                    done = true;
    982                }
    983                if (!inBounds()) {
    984                    ERROR(errorCode);
    985                    error = true;
    986                    break;
    987                }
    988            }
    989        }
    990    }
    991 
    992    if (error) {
    993        return {};
    994    }
    995 
    996    // Parse the closing '|'
    997    parseToken(PIPE, errorCode);
    998 
    999    return Literal(true, contents);
   1000 }
   1001 
   1002 // Parse (1*DIGIT)
   1003 UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
   1004    if (U_FAILURE(errorCode)) {
   1005        return {};
   1006    }
   1007 
   1008    U_ASSERT(isDigit(peek()));
   1009 
   1010    UnicodeString contents;
   1011    do {
   1012        contents += peek();
   1013        normalizedInput += peek();
   1014        next();
   1015        if (!inBounds()) {
   1016            ERROR(errorCode);
   1017            return {};
   1018        }
   1019    } while (isDigit(peek()));
   1020 
   1021    return contents;
   1022 }
   1023 /*
   1024  Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
   1025 */
   1026 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
   1027    if (U_FAILURE(errorCode)) {
   1028        return {};
   1029    }
   1030    // unquoted-literal = 1*name-char
   1031 
   1032    if (!(isNameChar(peek()))) {
   1033        ERROR(errorCode);
   1034        return {};
   1035    }
   1036 
   1037    UnicodeString contents;
   1038    parseNameChars(contents, errorCode);
   1039    return Literal(false, contents);
   1040 }
   1041 
   1042 /*
   1043  Consume and return a literal, matching the `literal` nonterminal in the grammar.
   1044 */
   1045 Literal Parser::parseLiteral(UErrorCode& errorCode) {
   1046    Literal result;
   1047    if (!inBounds()) {
   1048        ERROR(errorCode);
   1049    } else {
   1050        if (peek() == PIPE) {
   1051            result = parseQuotedLiteral(errorCode);
   1052        } else {
   1053            result = parseUnquotedLiteral(errorCode);
   1054        }
   1055        // Guarantee postcondition
   1056        if (!inBounds()) {
   1057            ERROR(errorCode);
   1058        }
   1059    }
   1060 
   1061    return result;
   1062 }
   1063 
   1064 /*
   1065  Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
   1066 
   1067  Adds the option to `options`
   1068 */
   1069 template<class T>
   1070 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
   1071    U_ASSERT(inBounds());
   1072 
   1073    U_ASSERT(peek() == AT);
   1074    // Consume the '@'
   1075    parseToken(AT, errorCode);
   1076 
   1077    // Parse LHS
   1078    UnicodeString lhs = parseIdentifier(errorCode);
   1079 
   1080    // Prepare to "backtrack" to resolve ambiguity
   1081    // about whether whitespace precedes another
   1082    // attribute, or the '=' sign
   1083    int32_t savedIndex = index;
   1084    parseOptionalWhitespace();
   1085 
   1086    Operand rand;
   1087    if (peek() == EQUALS) {
   1088        // Parse '='
   1089        parseTokenWithWhitespace(EQUALS, errorCode);
   1090 
   1091        UnicodeString rhsStr;
   1092        // Parse RHS, which must be a literal
   1093        // attribute = "@" identifier [o "=" o literal]
   1094        rand = Operand(parseLiteral(errorCode));
   1095    } else {
   1096        // attribute -> "@" identifier [[s] "=" [s]]
   1097        // Use null operand, which `rand` is already set to
   1098        // "Backtrack" by restoring the whitespace (if there was any)
   1099        index = savedIndex;
   1100    }
   1101 
   1102    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
   1103 }
   1104 
   1105 /*
   1106  Consume a name-value pair, matching the `option` nonterminal in the grammar.
   1107 
   1108  Adds the option to `optionList`
   1109 */
   1110 template<class T>
   1111 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
   1112    U_ASSERT(inBounds());
   1113 
   1114    // Parse LHS
   1115    UnicodeString lhs = parseIdentifier(errorCode);
   1116 
   1117    // Parse '='
   1118    parseTokenWithWhitespace(EQUALS, errorCode);
   1119 
   1120    UnicodeString rhsStr;
   1121    Operand rand;
   1122    // Parse RHS, which is either a literal or variable
   1123    switch (peek()) {
   1124    case DOLLAR: {
   1125        rand = Operand(parseVariableName(errorCode));
   1126        break;
   1127    }
   1128    default: {
   1129        // Must be a literal
   1130        rand = Operand(parseLiteral(errorCode));
   1131        break;
   1132    }
   1133    }
   1134    U_ASSERT(!rand.isNull());
   1135 
   1136    // Finally, add the key=value mapping
   1137    // Use a local error code, check for duplicate option error and
   1138    // record it as with other errors
   1139    UErrorCode status = U_ZERO_ERROR;
   1140    addOption.addOption(lhs, std::move(rand), status);
   1141    if (U_FAILURE(status)) {
   1142      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
   1143      errors.setDuplicateOptionName(errorCode);
   1144    }
   1145 }
   1146 
   /*
    TODO: there are multiple overloads of parseOptions(): one for parsing
    options within markup, one for parsing options within an expression,
    and one for parsing attributes. These overloads should be refactored.
   */
   1152 
   1153 /*
   1154  Consume optional whitespace followed by a sequence of options
   1155  (possibly empty), separated by whitespace
   1156 */
   1157 template <class T>
   1158 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
   1159    // Early exit if out of bounds -- no more work is possible
   1160    CHECK_BOUNDS(errorCode);
   1161 
   1162 /*
   1163 Arbitrary lookahead is required to parse option lists. To see why, consider
   1164 these rules from the grammar:
   1165 
   1166 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
   1167 annotation = (function *(s option)) / reserved
   1168 
   1169 And this example:
   1170 {:foo  }
   1171 
   1172 Derivation:
   1173 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
   1174           -> "{" [s] annotation [s] "}"
   1175           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
   1176           -> "{" [s] function *(s option) [s] "}"
   1177 
   1178 In this example, knowing whether to expect a '}' or the start of another option
   1179 after the whitespace would require arbitrary lookahead -- in other words, which
   1180 rule should we apply?
   1181    *(s option) -> s option *(s option)
   1182  or
   1183    *(s option) ->
   1184 
   1185 The same would apply to the example {:foo k=v } (note the trailing space after "v").
   1186 
   1187 This is addressed using a form of backtracking and (to make the backtracking easier
   1188 to apply) a slight refactoring to the grammar.
   1189 
   1190 This code is written as if the grammar is:
   1191  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
   1192  annotation = (function *(s option) [s]) / (reserved [s])
   1193 
   1194 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
   1195 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
   1196 
   1197 Note that when "backtracking" really just means early exit, since only whitespace
   1198 is involved and there's no state to save.
   1199 
   1200 There is a separate but similar ambiguity as to whether the space precedes
   1201 an option or an attribute.
   1202 */
   1203 
   1204    while(true) {
   1205        // If the next character is not whitespace, that means we've already
   1206        // parsed the entire options list (which may have been empty) and there's
   1207        // no trailing whitespace. In that case, exit.
   1208        if (!isWhitespace(peek())) {
   1209            break;
   1210        }
   1211        int32_t firstWhitespace = index;
   1212 
   1213        // In any case other than an empty options list, there must be at least
   1214        // one whitespace character.
   1215        parseRequiredWhitespace(errorCode);
   1216        // Restore precondition
   1217        CHECK_BOUNDS(errorCode);
   1218 
   1219        // If a name character follows, then at least one more option remains
   1220        // in the list.
   1221        // Otherwise, we've consumed all the options and any trailing whitespace,
   1222        // and can exit.
   1223        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
   1224        // so we back out to [s].
   1225        if (!isNameStart(peek())) {
   1226            // We've consumed all the options (meaning that either we consumed non-empty
   1227            // whitespace, or consumed at least one option.)
   1228            // Done.
   1229            // Remove the required whitespace from normalizedInput
   1230            normalizedInput.truncate(normalizedInput.length() - 1);
   1231            // "Backtrack" so as to leave the optional whitespace there
   1232            // when parsing attributes
   1233            index = firstWhitespace;
   1234            break;
   1235        }
   1236        parseOption(addOption, errorCode);
   1237    }
   1238 }
   1239 
   1240 /*
   1241  Consume optional whitespace followed by a sequence of attributes
   1242  (possibly empty), separated by whitespace
   1243 */
   1244 template<class T>
   1245 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
   1246 
   1247    // Early exit if out of bounds -- no more work is possible
   1248    if (!inBounds()) {
   1249        ERROR(errorCode);
   1250        return;
   1251    }
   1252 
   1253 /*
   1254 Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
   1255 (See comment in parseOptions()).
   1256 */
   1257 
   1258    while(true) {
   1259        // If the next character is not whitespace, that means we've already
   1260        // parsed the entire attributes list (which may have been empty) and there's
   1261        // no trailing whitespace. In that case, exit.
   1262        if (!isWhitespace(peek())) {
   1263            break;
   1264        }
   1265 
   1266        // In any case other than an empty attributes list, there must be at least
   1267        // one whitespace character.
   1268        parseRequiredWhitespace(errorCode);
   1269        // Restore precondition
   1270        if (!inBounds()) {
   1271            ERROR(errorCode);
   1272            break;
   1273        }
   1274 
   1275        // If an '@' follows, then at least one more attribute remains
   1276        // in the list.
   1277        // Otherwise, we've consumed all the attributes and any trailing whitespace,
   1278        // and can exit.
   1279        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
   1280        // so we back out to [s].
   1281        if (peek() != AT) {
   1282            // We've consumed all the attributes (meaning that either we consumed non-empty
   1283            // whitespace, or consumed at least one attribute.)
   1284            // Done.
   1285            // Remove the whitespace from normalizedInput
   1286            normalizedInput.truncate(normalizedInput.length() - 1);
   1287            break;
   1288        }
   1289        parseAttribute(attrAdder, errorCode);
   1290    }
   1291 }
   1292 
   1293 /*
   1294  Consume a function call, matching the `annotation`
   1295  nonterminal in the grammar
   1296 
   1297  Returns an `Operator` representing this (a reserved is a parse error)
   1298 */
   1299 Operator Parser::parseAnnotation(UErrorCode& status) {
   1300    U_ASSERT(inBounds());
   1301    Operator::Builder ratorBuilder(status);
   1302    if (U_FAILURE(status)) {
   1303        return {};
   1304    }
   1305    if (isFunctionStart(peek())) {
   1306        // Consume the function name
   1307        FunctionName func = parseFunction(status);
   1308        ratorBuilder.setFunctionName(std::move(func));
   1309 
   1310        OptionAdder<Operator::Builder> addOptions(ratorBuilder);
   1311        // Consume the options (which may be empty)
   1312        parseOptions(addOptions, status);
   1313    } else {
   1314        ERROR(status);
   1315    }
   1316    return ratorBuilder.build(status);
   1317 }
   1318 
   1319 /*
   1320  Consume a literal or variable (depending on `isVariable`),
   1321  followed by either required whitespace followed by an annotation,
   1322  or optional whitespace.
   1323 */
   1324 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
   1325                                                  Expression::Builder& builder,
   1326                                                  UErrorCode& status) {
   1327    CHECK_ERROR(status);
   1328 
   1329    U_ASSERT(inBounds());
   1330 
   1331    Operand rand;
   1332    if (isVariable) {
   1333        rand = Operand(parseVariableName(status));
   1334    } else {
   1335        rand = Operand(parseLiteral(status));
   1336    }
   1337 
   1338    builder.setOperand(std::move(rand));
   1339 
   1340 /*
   1341 Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
   1342 To see why, consider this rule from the grammar:
   1343 
   1344 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
   1345 
   1346 And this example:
   1347 
   1348 {|foo|   }
   1349 
   1350 Derivation:
   1351 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
   1352           -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
   1353           -> "{" [s] (literal [s annotation]) [s] "}"
   1354 
   1355 When reading the ' ' after the second '|', it's ambiguous whether that's the required
   1356 space before an annotation, or the optional space before the '}'.
   1357 
   1358 To make this ambiguity easier to handle, this code is based on the same grammar
   1359 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
   1360 the comment in `parseOptions()` for details.
   1361 */
   1362 
   1363    if (isWhitespace(peek())) {
   1364      int32_t firstWhitespace = index;
   1365 
   1366      // If the next character is whitespace, either [s annotation] or [s] applies
   1367      // (the character is either the required space before an annotation, or optional
   1368      // trailing space after the literal or variable). It's still ambiguous which
   1369      // one does apply.
   1370      parseOptionalWhitespace();
   1371      // Restore precondition
   1372      CHECK_BOUNDS(status);
   1373 
   1374      // This next check resolves the ambiguity between [s annotation] and [s]
   1375      bool isSAnnotation = isAnnotationStart(peek());
   1376 
   1377      if (isSAnnotation) {
   1378        normalizedInput += SPACE;
   1379      }
   1380 
   1381      if (isSAnnotation) {
   1382        // The previously consumed whitespace precedes an annotation
   1383        builder.setOperator(parseAnnotation(status));
   1384      } else {
   1385          // Either there's a right curly brace (will be consumed by the caller),
   1386          // or there's an error and the trailing whitespace should be
   1387          // handled by the caller. However, this is not an error
   1388          // here because we're just parsing `literal [s annotation]`.
   1389          index = firstWhitespace;
   1390      }
   1391    } else {
   1392      // Either there was never whitespace, or
   1393      // the previously consumed whitespace is the optional trailing whitespace;
   1394      // either the next character is '}' or the error will be handled by parseExpression.
   1395      // Do nothing, since the operand was already set
   1396    }
   1397 
   1398    // At the end of this code, the next character should either be '}',
   1399    // whitespace followed by a '}',
   1400    // or end-of-input
   1401 }
   1402 
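   /*
    The exprFallback() helpers below produce the fallback operand/expression.
    parseExpression(), defined after them, consumes an expression, matching
    the `expression` nonterminal in the grammar.
   */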
   1406 
   1407 static void exprFallback(Expression::Builder& exprBuilder) {
   1408    // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
   1409    // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
   1410    exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
   1411 }
   1412 
   1413 static Expression exprFallback(UErrorCode& status) {
   1414    Expression result;
   1415    if (U_SUCCESS(status)) {
   1416        Expression::Builder exprBuilder(status);
   1417        if (U_SUCCESS(status)) {
   1418            // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
   1419            // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
   1420            exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
   1421            UErrorCode status = U_ZERO_ERROR;
   1422            result = exprBuilder.build(status);
   1423            // An operand was set, so there can't be an error
   1424            U_ASSERT(U_SUCCESS(status));
   1425        }
   1426    }
   1427    return result;
   1428 }
   1429 
   1430 Expression Parser::parseExpression(UErrorCode& status) {
   1431    if (U_FAILURE(status)) {
   1432        return {};
   1433    }
   1434 
   1435    // Early return if out of input -- no more work is possible
   1436    U_ASSERT(inBounds());
   1437 
   1438    // Parse opening brace
   1439    parseToken(LEFT_CURLY_BRACE, status);
   1440    // Optional whitespace after opening brace
   1441    parseOptionalWhitespace();
   1442 
   1443    Expression::Builder exprBuilder(status);
   1444    // Restore precondition
   1445    if (!inBounds()) {
   1446        exprFallback(exprBuilder);
   1447    } else {
   1448        // literal '|', variable '$' or annotation
   1449        switch (peek()) {
   1450        case PIPE: {
   1451            // Quoted literal
   1452            parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
   1453            break;
   1454        }
   1455        case DOLLAR: {
   1456            // Variable
   1457            parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
   1458            break;
   1459        }
   1460        default: {
   1461            if (isAnnotationStart(peek())) {
   1462                Operator rator = parseAnnotation(status);
   1463                exprBuilder.setOperator(std::move(rator));
   1464            } else if (isUnquotedStart(peek())) {
   1465                // Unquoted literal
   1466                parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
   1467            } else {
   1468                // Not a literal, variable or annotation -- error out
   1469                ERROR(status);
   1470                exprFallback(exprBuilder);
   1471                break;
   1472            }
   1473            break;
   1474        }
   1475        }
   1476    }
   1477 
   1478    // Parse attributes
   1479    AttributeAdder<Expression::Builder> attrAdder(exprBuilder);
   1480    parseAttributes(attrAdder, status);
   1481 
   1482    // Parse optional space
   1483    // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
   1484    parseOptionalWhitespace();
   1485 
   1486    // Either an operand or operator (or both) must have been set already,
   1487    // so there can't be an error
   1488    UErrorCode localStatus = U_ZERO_ERROR;
   1489    Expression result = exprBuilder.build(localStatus);
   1490    U_ASSERT(U_SUCCESS(localStatus));
   1491 
   1492    // Check for end-of-input and missing '}'
   1493    if (!inBounds()) {
   1494        ERROR(status);
   1495    } else {
   1496        // Otherwise, it's safe to check for the '}'
   1497        parseToken(RIGHT_CURLY_BRACE, status);
   1498    }
   1499    return result;
   1500 }
   1501 
   1502 /*
   1503  Parse a .local declaration, matching the `local-declaration`
   1504  production in the grammar
   1505 */
   1506 void Parser::parseLocalDeclaration(UErrorCode& status) {
   1507    // End-of-input here would be an error; even empty
   1508    // declarations must be followed by a body
   1509    CHECK_BOUNDS(status);
   1510 
   1511    parseToken(ID_LOCAL, status);
   1512    parseRequiredWhitespace(status);
   1513 
   1514    // Restore precondition
   1515    CHECK_BOUNDS(status);
   1516    VariableName lhs = parseVariableName(status);
   1517    parseTokenWithWhitespace(EQUALS, status);
   1518    // Restore precondition before calling parseExpression()
   1519    CHECK_BOUNDS(status);
   1520 
   1521    Expression rhs = parseExpression(status);
   1522 
   1523    // Add binding from lhs to rhs, unless there was an error
   1524    // (This ensures that if there was a correct lhs but a
   1525    // parse error in rhs, the fallback for uses of the
   1526    // lhs will be its own name rather than the rhs)
   1527    /* This affects the behavior of this test case, which the spec
   1528       is ambiguous about:
   1529 
   1530       .local $bar {|foo|} {{{$bar}}}
   1531 
   1532       Should `$bar` still be bound to a value although
   1533       its declaration is syntactically incorrect (missing the '=')?
   1534       This code says no, but it needs to change if
   1535       https://github.com/unicode-org/message-format-wg/issues/703
   1536       is resolved differently.
   1537    */
   1538    CHECK_ERROR(status);
   1539    if (!errors.hasSyntaxError()) {
   1540        dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
   1541        // Check if status is U_DUPLICATE_DECLARATION_ERROR
   1542        // and add that as an internal error if so
   1543        if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
   1544            status = U_ZERO_ERROR;
   1545            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
   1546        }
   1547    }
   1548 }
   1549 
   1550 /*
   1551  Parse an .input declaration, matching the `local-declaration`
   1552  production in the grammar
   1553 */
   1554 void Parser::parseInputDeclaration(UErrorCode& status) {
   1555    // End-of-input here would be an error; even empty
   1556    // declarations must be followed by a body
   1557    CHECK_BOUNDS(status);
   1558 
   1559    parseToken(ID_INPUT, status);
   1560    parseOptionalWhitespace();
   1561 
   1562    // Restore precondition before calling parseExpression()
   1563    CHECK_BOUNDS(status);
   1564 
   1565    // Save the index for error diagnostics
   1566    int32_t exprIndex = index;
   1567    Expression rhs = parseExpression(status);
   1568 
   1569    // Here we have to check that the rhs is a variable-expression
   1570    if (!rhs.getOperand().isVariable()) {
   1571        // This case is a syntax error; report it at the beginning
   1572        // of the expression
   1573        ERROR_AT(status, exprIndex);
   1574        return;
   1575    }
   1576 
   1577    VariableName lhs = rhs.getOperand().asVariable();
   1578 
   1579    // Add binding from lhs to rhs
   1580    // This just adds a new local variable that shadows the message
   1581    // argument referred to, which is harmless.
   1582    // When evaluating the RHS, the new local is not in scope
   1583    // and the message argument will be correctly referred to.
   1584    CHECK_ERROR(status);
   1585    if (!errors.hasSyntaxError()) {
   1586        dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
   1587        // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
   1588        // and add that as an internal error if so
   1589        if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
   1590            status = U_ZERO_ERROR;
   1591            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
   1592        }
   1593    }
   1594 }
   1595 
   1596 /*
   1597  Consume a possibly-empty sequence of declarations separated by whitespace;
   1598  each declaration matches the `declaration` nonterminal in the grammar
   1599 
   1600  Builds up an environment representing those declarations
   1601 */
   1602 void Parser::parseDeclarations(UErrorCode& status) {
   1603    // End-of-input here would be an error; even empty
   1604    // declarations must be followed by a body
   1605    CHECK_BOUNDS(status);
   1606 
   1607    while (peek() == PERIOD) {
   1608        CHECK_BOUNDS_1(status);
   1609        if (peek(1) == ID_LOCAL[1]) {
   1610            parseLocalDeclaration(status);
   1611        } else if (peek(1) == ID_INPUT[1]) {
   1612            parseInputDeclaration(status);
   1613        } else {
   1614            // Done parsing declarations
   1615            break;
   1616        }
   1617 
   1618        // Avoid looping infinitely
   1619        CHECK_ERROR(status);
   1620 
   1621        parseOptionalWhitespace();
   1622        // Restore precondition
   1623        CHECK_BOUNDS(status);
   1624    }
   1625 }
   1626 
   1627 /*
   1628  Consume a text character
   1629  matching the `text-char` nonterminal in the grammar
   1630 
   1631  No postcondition (a message can end with a text-char)
   1632 */
   1633 UnicodeString Parser::parseTextChar(UErrorCode& status) {
   1634    UnicodeString str;
   1635    if (!inBounds() || !(isTextChar(peek()))) {
   1636        // Error -- text-char is expected here
   1637        ERROR(status);
   1638    } else {
   1639        // See comment in parseQuotedLiteral()
   1640        if (isEscapableChar(peek())) {
   1641            normalizedInput += BACKSLASH;
   1642        }
   1643        normalizedInput += peek();
   1644        str += peek();
   1645        next();
   1646        maybeAdvanceLine();
   1647    }
   1648    return str;
   1649 }
   1650 
   1651 /*
   1652  Consume an `nmtoken`, `literal`, or the string "*", matching
   1653  the `key` nonterminal in the grammar
   1654 */
   1655 Key Parser::parseKey(UErrorCode& status) {
   1656    U_ASSERT(inBounds());
   1657 
   1658    Key k; // wildcard by default
   1659    // Literal | '*'
   1660    switch (peek()) {
   1661    case ASTERISK: {
   1662        next();
   1663        normalizedInput += ASTERISK;
   1664        // Guarantee postcondition
   1665        if (!inBounds()) {
   1666            ERROR(status);
   1667            return k;
   1668        }
   1669        break;
   1670    }
   1671    default: {
   1672        // Literal
   1673        k = Key(parseLiteral(status));
   1674        break;
   1675    }
   1676    }
   1677    return k;
   1678 }
   1679 
   1680 /*
   1681  Consume a non-empty sequence of `key`s separated by whitespace
   1682 
   1683  Takes ownership of `keys`
   1684 */
   1685 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
   1686    SelectorKeys result;
   1687 
   1688    if (U_FAILURE(status)) {
   1689        return result;
   1690    }
   1691 
   1692    U_ASSERT(inBounds());
   1693 
   1694 /*
   1695 Arbitrary lookahead is required to parse key lists. To see why, consider
   1696 this rule from the grammar:
   1697 
   1698 variant = key *(s key) [s] quoted-pattern
   1699 
   1700 And this example:
   1701 when k1 k2   {a}
   1702 
   1703 Derivation:
   1704   variant -> key *(s key) [s] quoted-pattern
   1705           -> key s key *(s key) quoted-pattern
   1706 
   1707 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
   1708 to know whether to expect the start of a pattern or the start of another key.
   1709 In other words: is the second whitespace sequence the required space in *(s key),
   1710 or the optional space in [s] quoted-pattern?
   1711 
   1712 This is addressed using "backtracking" (similarly to `parseOptions()`).
   1713 */
   1714 
   1715    SelectorKeys::Builder keysBuilder(status);
   1716    if (U_FAILURE(status)) {
   1717        return result;
   1718    }
   1719 
   1720    // Since the first key is required, it's simplest to parse it separately.
   1721    keysBuilder.add(parseKey(status), status);
   1722 
   1723    // Restore precondition
   1724    if (!inBounds()) {
   1725        ERROR(status);
   1726        return result;
   1727    }
   1728 
   1729    // We've seen at least one whitespace-key pair, so now we can parse
   1730    // *(s key) [s]
   1731    while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) {
   1732        bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek());
   1733        parseRequiredWhitespace(status);
   1734        if (!wasWhitespace) {
   1735            // Avoid infinite loop when parsing something like:
   1736            // when * @{!...
   1737            next();
   1738        }
   1739 
   1740        // Restore precondition
   1741        if (!inBounds()) {
   1742            ERROR(status);
   1743            return result;
   1744        }
   1745 
   1746        // At this point, it's ambiguous whether we are inside (s key) or [s].
   1747        // This check resolves that ambiguity.
   1748        if (peek() == LEFT_CURLY_BRACE) {
   1749            // A pattern follows, so what we just parsed was the optional
   1750            // trailing whitespace. All the keys have been parsed.
   1751 
   1752            // Unpush the whitespace from `normalizedInput`
   1753            normalizedInput.truncate(normalizedInput.length() - 1);
   1754            break;
   1755        }
   1756        keysBuilder.add(parseKey(status), status);
   1757    }
   1758 
   1759    return keysBuilder.build(status);
   1760 }
   1761 
   1762 Pattern Parser::parseQuotedPattern(UErrorCode& status) {
   1763    U_ASSERT(inBounds());
   1764 
   1765    parseToken(LEFT_CURLY_BRACE, status);
   1766    parseToken(LEFT_CURLY_BRACE, status);
   1767    Pattern p = parseSimpleMessage(status);
   1768    parseToken(RIGHT_CURLY_BRACE, status);
   1769    parseToken(RIGHT_CURLY_BRACE, status);
   1770    return p;
   1771 }
   1772 
   1773 /*
   1774  Consume a `placeholder`, matching the nonterminal in the grammar
   1775  No postcondition (a markup can end a message)
   1776 */
   1777 Markup Parser::parseMarkup(UErrorCode& status) {
   1778    U_ASSERT(inBounds(1));
   1779 
   1780    U_ASSERT(peek() == LEFT_CURLY_BRACE);
   1781 
   1782    Markup::Builder builder(status);
   1783    if (U_FAILURE(status)) {
   1784        return {};
   1785    }
   1786 
   1787    // Consume the '{'
   1788    next();
   1789    normalizedInput += LEFT_CURLY_BRACE;
   1790    parseOptionalWhitespace();
   1791    bool closing = false;
   1792    switch (peek()) {
   1793    case NUMBER_SIGN: {
   1794        // Open or standalone; consume the '#'
   1795        normalizedInput += peek();
   1796        next();
   1797        break;
   1798    }
   1799    case SLASH: {
   1800        // Closing
   1801        normalizedInput += peek();
   1802        closing = true;
   1803        next();
   1804        break;
   1805    }
   1806    default: {
   1807        ERROR(status);
   1808        return {};
   1809    }
   1810    }
   1811 
   1812    // Parse the markup identifier
   1813    builder.setName(parseIdentifier(status));
   1814 
   1815    // Parse the options, which must begin with a ' '
   1816    // if present
   1817    if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
   1818        OptionAdder<Markup::Builder> optionAdder(builder);
   1819        parseOptions(optionAdder, status);
   1820    }
   1821 
   1822    // Parse the attributes, which also must begin
   1823    // with a ' '
   1824    if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
   1825        AttributeAdder<Markup::Builder> attrAdder(builder);
   1826        parseAttributes(attrAdder, status);
   1827    }
   1828 
   1829    parseOptionalWhitespace();
   1830 
   1831    bool standalone = false;
   1832    // Check if this is a standalone or not
   1833    if (!closing) {
   1834        if (inBounds() && peek() == SLASH) {
   1835            standalone = true;
   1836            normalizedInput += SLASH;
   1837            next();
   1838        }
   1839    }
   1840 
   1841    parseToken(RIGHT_CURLY_BRACE, status);
   1842 
   1843    if (standalone) {
   1844        builder.setStandalone();
   1845    } else if (closing) {
   1846        builder.setClose();
   1847    } else {
   1848        builder.setOpen();
   1849    }
   1850 
   1851    return builder.build(status);
   1852 }
   1853 
   1854 /*
   1855  Consume a `placeholder`, matching the nonterminal in the grammar
   1856  No postcondition (a placeholder can end a message)
   1857 */
   1858 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
   1859    U_ASSERT(peek() == LEFT_CURLY_BRACE);
   1860 
   1861    if (!inBounds()) {
   1862        ERROR(status);
   1863        return exprFallback(status);
   1864    }
   1865 
   1866    // Need to look ahead arbitrarily since whitespace
   1867    // can appear before the '{' and '#'
   1868    // in markup
   1869    int32_t tempIndex = 1;
   1870    bool isMarkup = false;
   1871    while (inBounds(1)) {
   1872        UChar32 c = peek(tempIndex);
   1873        if (c == NUMBER_SIGN || c == SLASH) {
   1874            isMarkup = true;
   1875            break;
   1876        }
   1877        if (!(isWhitespace(c) || isBidiControl(c))) {
   1878            break;
   1879        }
   1880        tempIndex++;
   1881    }
   1882 
   1883    if (isMarkup) {
   1884        return parseMarkup(status);
   1885    }
   1886    return parseExpression(status);
   1887 }
   1888 
   1889 /*
   1890  Consume a `simple-message`, matching the nonterminal in the grammar
   1891  Postcondition: `index == len()` or U_FAILURE(status);
   1892  for a syntactically correct message, this will consume the entire input
   1893 */
   1894 Pattern Parser::parseSimpleMessage(UErrorCode& status) {
   1895    Pattern::Builder result(status);
   1896 
   1897    if (U_SUCCESS(status)) {
   1898        Expression expression;
   1899        while (inBounds()) {
   1900            switch (peek()) {
   1901            case LEFT_CURLY_BRACE: {
   1902                // Must be placeholder
   1903                std::variant<Expression, Markup> piece = parsePlaceholder(status);
   1904                if (std::holds_alternative<Expression>(piece)) {
   1905                    Expression expr = *std::get_if<Expression>(&piece);
   1906                    result.add(std::move(expr), status);
   1907                } else {
   1908                    Markup markup = *std::get_if<Markup>(&piece);
   1909                    result.add(std::move(markup), status);
   1910                }
   1911                break;
   1912            }
   1913            case BACKSLASH: {
   1914                // Must be escaped-char
   1915                result.add(parseEscapeSequence(status), status);
   1916                break;
   1917            }
   1918            case RIGHT_CURLY_BRACE: {
   1919                // Distinguish unescaped '}' from end of quoted pattern
   1920                break;
   1921            }
   1922            default: {
   1923                // Must be text-char
   1924                result.add(parseTextChar(status), status);
   1925                break;
   1926            }
   1927            }
   1928            if (peek() == RIGHT_CURLY_BRACE) {
   1929                // End of quoted pattern
   1930                break;
   1931            }
   1932            // Don't loop infinitely
   1933            if (errors.hasSyntaxError() || U_FAILURE(status)) {
   1934                break;
   1935            }
   1936        }
   1937    }
   1938    return result.build(status);
   1939 }
   1940 
   1941 void Parser::parseVariant(UErrorCode& status) {
   1942    CHECK_ERROR(status);
   1943 
   1944    // At least one key is required
   1945    SelectorKeys keyList(parseNonEmptyKeys(status));
   1946 
   1947    // parseNonEmptyKeys() consumes any trailing whitespace,
   1948    // so the pattern can be consumed next.
   1949 
   1950    // Restore precondition before calling parsePattern()
   1951    // (which must return a non-null value)
   1952    CHECK_BOUNDS(status);
   1953    Pattern rhs = parseQuotedPattern(status);
   1954 
   1955    dataModel.addVariant(std::move(keyList), std::move(rhs), status);
   1956 }
   1957 
   1958 /*
   1959  Consume a `selectors` (matching the nonterminal in the grammar),
   1960  followed by a non-empty sequence of `variant`s (matching the nonterminal
   1961  in the grammar) preceded by whitespace
   1962  No postcondition (on return, `index` might equal `len()` with no syntax error
   1963  because a message can end with a variant)
   1964 */
   1965 void Parser::parseSelectors(UErrorCode& status) {
   1966    CHECK_ERROR(status);
   1967 
   1968    U_ASSERT(inBounds());
   1969 
   1970    parseToken(ID_MATCH, status);
   1971 
   1972    bool empty = true;
   1973    // Parse selectors
   1974    // "Backtracking" is required here. It's not clear if whitespace is
   1975    // (`[s]` selector) or (`[s]` variant)
   1976    while (isWhitespace(peek()) || peek() == DOLLAR) {
   1977        int32_t whitespaceStart = index;
   1978        parseRequiredWhitespace(status);
   1979        // Restore precondition
   1980        CHECK_BOUNDS(status);
   1981        if (peek() != DOLLAR) {
   1982            // This is not necessarily an error, but rather,
   1983            // means the whitespace we parsed was the optional
   1984            // whitespace preceding the first variant, not the
   1985            // required whitespace preceding a subsequent variable.
   1986            // In that case, "push back" the whitespace.
   1987            normalizedInput.truncate(normalizedInput.length() - 1);
   1988            index = whitespaceStart;
   1989            break;
   1990        }
   1991        VariableName var = parseVariableName(status);
   1992        empty = false;
   1993 
   1994        dataModel.addSelector(std::move(var), status);
   1995        CHECK_ERROR(status);
   1996    }
   1997 
   1998    // At least one selector is required
   1999    if (empty) {
   2000        ERROR(status);
   2001        return;
   2002    }
   2003 
   2004    #define CHECK_END_OF_INPUT                     \
   2005        if (!inBounds()) {                         \
   2006            break;                                 \
   2007        }                                          \
   2008 
   2009    // Parse variants
   2010    // matcher = match-statement s variant *(o variant)
   2011 
   2012    // Parse first variant
   2013    parseRequiredWhitespace(status);
   2014    if (!inBounds()) {
   2015        ERROR(status);
   2016        return;
   2017    }
   2018    parseVariant(status);
   2019    if (!inBounds()) {
   2020        // Not an error; there might be only one variant
   2021        return;
   2022    }
   2023 
   2024    while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) {
   2025        parseOptionalWhitespace();
   2026        // Restore the precondition.
   2027        // Trailing whitespace is allowed.
   2028        if (!inBounds()) {
   2029            return;
   2030        }
   2031 
   2032        parseVariant(status);
   2033 
   2034        // Restore the precondition, *without* erroring out if we've
   2035        // reached the end of input. That's because it's valid for the
   2036        // message to end with a variant that has no trailing whitespace.
   2037        // Why do we need to check this condition twice inside the loop?
   2038        // Because if we don't check it here, the `isWhitespace()` call in
   2039        // the loop head will read off the end of the input string.
   2040        CHECK_END_OF_INPUT
   2041 
   2042        if (errors.hasSyntaxError() || U_FAILURE(status)) {
   2043            break;
   2044        }
   2045    }
   2046 }
   2047 
   /*
     parseBody() consumes a `body` (matching the nonterminal in the grammar).
     No postcondition (on return, `index` might equal `len()` with no syntax error,
     because a message can end with a body; trailing whitespace is optional)
   */
   2053 
   2054 void Parser::errorPattern(UErrorCode& status) {
   2055    errors.addSyntaxError(status);
   2056    // Set to empty pattern
   2057    Pattern::Builder result = Pattern::Builder(status);
   2058    CHECK_ERROR(status);
   2059 
   2060    // If still in bounds, then add the remaining input as a single text part
   2061    // to the pattern
   2062    /*
   2063      TODO: this behavior isn't documented in the spec, but it comes from
   2064      https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
   2065      and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
   2066      whether this is the intent behind the spec
   2067     */
   2068    UnicodeString partStr(LEFT_CURLY_BRACE);
   2069    while (inBounds()) {
   2070        partStr += peek();
   2071        next();
   2072    }
   2073    // Add curly braces around the entire output (same comment as above)
   2074    partStr += RIGHT_CURLY_BRACE;
   2075    result.add(std::move(partStr), status);
   2076    dataModel.setPattern(result.build(status));
   2077 }
   2078 
   2079 void Parser::parseBody(UErrorCode& status) {
   2080    CHECK_ERROR(status);
   2081 
   2082    // Out-of-input is a syntax warning
   2083    if (!inBounds()) {
   2084        errorPattern(status);
   2085        return;
   2086    }
   2087 
   2088    // Body must be either a pattern or selectors
   2089    switch (peek()) {
   2090    case LEFT_CURLY_BRACE: {
   2091        // Pattern
   2092        dataModel.setPattern(parseQuotedPattern(status));
   2093        break;
   2094    }
   2095    case ID_MATCH[0]: {
   2096        // Selectors
   2097        parseSelectors(status);
   2098        return;
   2099    }
   2100    default: {
   2101        ERROR(status);
   2102        errorPattern(status);
   2103        return;
   2104    }
   2105    }
   2106 }
   2107 
   2108 // -------------------------------------
   2109 // Parses the source pattern.
   2110 
   2111 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
   2112    CHECK_ERROR(status);
   2113 
   2114    bool complex = false;
   2115    // First, "look ahead" to determine if this is a simple or complex
   2116    // message. To do that, check the first non-whitespace character.
   2117    while (inBounds(index) && (isWhitespace(peek()) || isBidiControl(peek()))) {
   2118        next();
   2119    }
   2120 
   2121    // Message can be empty, so we need to only look ahead
   2122    // if we know it's non-empty
   2123    if (inBounds()) {
   2124        if (peek() == PERIOD
   2125            || (inBounds(1)
   2126                && peek() == LEFT_CURLY_BRACE
   2127                && peek(1) == LEFT_CURLY_BRACE)) {
   2128            complex = true;
   2129        }
   2130    }
   2131    // Reset index
   2132    index = 0;
   2133 
   2134    // Message can be empty, so we need to only look ahead
   2135    // if we know it's non-empty
   2136    if (complex) {
   2137        parseOptionalWhitespace();
   2138        parseDeclarations(status);
   2139        parseBody(status);
   2140        parseOptionalWhitespace();
   2141    } else {
   2142        // Simple message
   2143        // For normalization, quote the pattern
   2144        normalizedInput += LEFT_CURLY_BRACE;
   2145        normalizedInput += LEFT_CURLY_BRACE;
   2146        dataModel.setPattern(parseSimpleMessage(status));
   2147        normalizedInput += RIGHT_CURLY_BRACE;
   2148        normalizedInput += RIGHT_CURLY_BRACE;
   2149    }
   2150 
   2151    CHECK_ERROR(status);
   2152 
   2153    // There are no errors; finally, check that the entire input was consumed
   2154    if (!allConsumed()) {
   2155        ERROR(status);
   2156    }
   2157 
   2158    // Finally, copy the relevant fields of the internal `MessageParseError`
   2159    // into the `UParseError` argument
   2160    translateParseError(parseError, parseErrorResult);
   2161 }
   2162 
   2163 Parser::~Parser() {}
   2164 
   2165 } // namespace message2
   2166 U_NAMESPACE_END
   2167 
   2168 #endif /* #if !UCONFIG_NO_MF2 */
   2169 
   2170 #endif /* #if !UCONFIG_NO_FORMATTING */
   2171 
   2172 #endif /* #if !UCONFIG_NO_NORMALIZATION */