messageformat2_parser.cpp (66568B)
1 // © 2024 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_NORMALIZATION 7 8 #if !UCONFIG_NO_FORMATTING 9 10 #if !UCONFIG_NO_MF2 11 12 #include "unicode/uniset.h" 13 #include "messageformat2_errors.h" 14 #include "messageformat2_macros.h" 15 #include "messageformat2_parser.h" 16 #include "ucln_in.h" 17 #include "umutex.h" 18 #include "uvector.h" // U_ASSERT 19 20 U_NAMESPACE_BEGIN 21 22 namespace message2 { 23 24 using namespace pluralimpl; 25 26 using namespace data_model; 27 28 /* 29 The `ERROR()` macro sets a syntax error in the context 30 and sets the offset in `parseError` to `index`. It does not alter control flow. 31 */ 32 #define ERROR(errorCode) \ 33 if (!errors.hasSyntaxError()) { \ 34 setParseError(parseError, index); \ 35 errors.addSyntaxError(errorCode); \ 36 } 37 38 #define ERROR_AT(errorCode, i) \ 39 if (!errors.hasSyntaxError()) { \ 40 setParseError(parseError, i); \ 41 errors.addSyntaxError(errorCode); \ 42 } 43 44 // Increments the line number and updates the "characters seen before 45 // current line" count in `parseError`, iff `peek()` is a newline 46 void Parser::maybeAdvanceLine() { 47 if (peek() == LF) { 48 parseError.line++; 49 // add 1 to index to get the number of characters seen so far 50 // (including the newline) 51 parseError.lengthBeforeCurrentLine = index + 1; 52 } 53 } 54 55 /* 56 Signals an error and returns either if `parseError` already denotes an 57 error, or `index` is out of bounds for the string `source` 58 */ 59 #define CHECK_BOUNDS(errorCode) \ 60 if (!inBounds()) { \ 61 ERROR(errorCode); \ 62 return; \ 63 } 64 #define CHECK_BOUNDS_1(errorCode) \ 65 if (!inBounds(1)) { \ 66 ERROR_AT(errorCode, index + 1); \ 67 return; \ 68 } 69 70 // ------------------------------------- 71 // Helper functions 72 73 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) { 74 for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) { 75 out[i] = in[i]; 76 if (in[i] == '\0') { 77 break; 78 } 79 } 80 } 81 82 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) { 83 parseError.line = messageParseError.line; 84 parseError.offset = messageParseError.offset; 85 copyContext(messageParseError.preContext, parseError.preContext); 86 copyContext(messageParseError.postContext, parseError.postContext); 87 } 88 89 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) { 90 // Translate absolute to relative offset 91 parseError.offset = index // Start with total number of characters seen 92 - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line 93 // TODO: Fill this in with actual pre and post-context 94 parseError.preContext[0] = 0; 95 parseError.postContext[0] = 0; 96 } 97 98 // ------------------------------------- 99 // Initialization of UnicodeSets 100 101 namespace unisets { 102 103 UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {}; 104 105 inline UnicodeSet* getImpl(Key key) { 106 return gUnicodeSets[key]; 107 } 108 109 icu::UInitOnce gMF2ParseUniSetsInitOnce {}; 110 } 111 112 UnicodeSet* initContentChars(UErrorCode& status) { 113 if (U_FAILURE(status)) { 114 return nullptr; 115 } 116 117 UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF 118 if (result == nullptr) { 119 status = U_MEMORY_ALLOCATION_ERROR; 120 return nullptr; 121 } 122 result->add(0x000B, 0x000C); // Omit CR 123 result->add(0x000E, 0x001F); // Omit SP 124 result->add(0x0021, 0x002D); // Omit '.' 125 result->add(0x002F, 0x003F); // Omit '@' 126 result->add(0x0041, 0x005B); // Omit '\' 127 result->add(0x005D, 0x007A); // Omit { | } 128 result->add(0x007E, 0x2FFF); // Omit IDEOGRAPHIC_SPACE 129 result->add(0x3001, 0x10FFFF); // Allowing surrogates is intentional 130 result->freeze(); 131 return result; 132 } 133 134 UnicodeSet* initWhitespace(UErrorCode& status) { 135 if (U_FAILURE(status)) { 136 return nullptr; 137 } 138 139 UnicodeSet* result = new UnicodeSet(); 140 if (result == nullptr) { 141 status = U_MEMORY_ALLOCATION_ERROR; 142 return nullptr; 143 } 144 result->add(SPACE); 145 result->add(HTAB); 146 result->add(CR); 147 result->add(LF); 148 result->add(IDEOGRAPHIC_SPACE); 149 result->freeze(); 150 return result; 151 } 152 153 UnicodeSet* initBidiControls(UErrorCode& status) { 154 UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status); 155 if (U_FAILURE(status)) { 156 return nullptr; 157 } 158 result->add(0x200E, 0x200F); 159 result->add(0x2066, 0x2069); 160 result->freeze(); 161 return result; 162 } 163 164 UnicodeSet* initAlpha(UErrorCode& status) { 165 UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status); 166 if (U_FAILURE(status)) { 167 return nullptr; 168 } 169 result->freeze(); 170 return result; 171 } 172 173 UnicodeSet* initDigits(UErrorCode& status) { 174 UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status); 175 if (U_FAILURE(status)) { 176 return nullptr; 177 } 178 result->freeze(); 179 return result; 180 } 181 182 UnicodeSet* initNameStartChars(UErrorCode& status) { 183 if (U_FAILURE(status)) { 184 return nullptr; 185 } 186 187 UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status); 188 if (U_FAILURE(status)) { 189 return nullptr; 190 } 191 UnicodeSet* result = new UnicodeSet(); 192 if (result == nullptr) { 193 status = U_MEMORY_ALLOCATION_ERROR; 194 return nullptr; 195 }; 196 197 result->addAll(*isAlpha); 198 result->add(0x002B); 199 result->add(0x005F); 200 result->add(0x00A1, 0x061B); 201 result->add(0x061D, 0x167F); 202 result->add(0x1681, 0x1FFF); 203 result->add(0x200B, 0x200D); 204 result->add(0x2010, 0x2027); 205 result->add(0x2030, 0x205E); 206 result->add(0x2060, 0x2065); 207 result->add(0x206A, 0x2FFF); 208 result->add(0x3001, 0xD7FF); 209 result->add(0xE000, 0xFDCF); 210 result->add(0xFDF0, 0xFFFD); 211 result->add(0x10000, 0x1FFFD); 212 result->add(0x20000, 0x2FFFD); 213 result->add(0x30000, 0x3FFFD); 214 result->add(0x40000, 0x4FFFD); 215 result->add(0x50000, 0x5FFFD); 216 result->add(0x60000, 0x6FFFD); 217 result->add(0x70000, 0x7FFFD); 218 result->add(0x80000, 0x8FFFD); 219 result->add(0x90000, 0x9FFFD); 220 result->add(0xA0000, 0xAFFFD); 221 result->add(0xB0000, 0xBFFFD); 222 result->add(0xC0000, 0xCFFFD); 223 result->add(0xD0000, 0xDFFFD); 224 result->add(0xE0000, 0xEFFFD); 225 result->add(0xF0000, 0xFFFFD); 226 result->add(0x100000, 0x10FFFD); 227 result->freeze(); 228 return result; 229 } 230 231 UnicodeSet* initNameChars(UErrorCode& status) { 232 if (U_FAILURE(status)) { 233 return nullptr; 234 } 235 236 UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status); 237 UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status); 238 if (U_FAILURE(status)) { 239 return nullptr; 240 } 241 UnicodeSet* result = new UnicodeSet(); 242 if (result == nullptr) { 243 status = U_MEMORY_ALLOCATION_ERROR; 244 return nullptr; 245 }; 246 result->addAll(*nameStart); 247 result->addAll(*digit); 248 result->add(HYPHEN); 249 result->add(PERIOD); 250 result->freeze(); 251 return result; 252 } 253 254 UnicodeSet* initTextChars(UErrorCode& status) { 255 if (U_FAILURE(status)) { 256 return nullptr; 257 } 258 259 UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status); 260 UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status); 261 if (U_FAILURE(status)) { 262 return nullptr; 263 } 264 UnicodeSet* result = new UnicodeSet(); 265 if (result == nullptr) { 266 status = U_MEMORY_ALLOCATION_ERROR; 267 return nullptr; 268 }; 269 result->addAll(*content); 270 result->addAll(*whitespace); 271 result->add(PERIOD); 272 result->add(AT); 273 result->add(PIPE); 274 result->freeze(); 275 return result; 276 } 277 278 UnicodeSet* initQuotedChars(UErrorCode& status) { 279 if (U_FAILURE(status)) { 280 return nullptr; 281 } 282 283 unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status); 284 if (U_FAILURE(status)) { 285 return nullptr; 286 } 287 UnicodeSet* result = new UnicodeSet(); 288 if (result == nullptr) { 289 status = U_MEMORY_ALLOCATION_ERROR; 290 return nullptr; 291 }; 292 // content and whitespace were initialized by `initTextChars()` 293 UnicodeSet* content = unisets::getImpl(unisets::CONTENT); 294 if (content == nullptr) { 295 status = U_MEMORY_ALLOCATION_ERROR; 296 return nullptr; 297 } 298 result->addAll(*content); 299 UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE); 300 if (whitespace == nullptr) { 301 status = U_MEMORY_ALLOCATION_ERROR; 302 return nullptr; 303 } 304 result->addAll(*whitespace); 305 result->add(PERIOD); 306 result->add(AT); 307 result->add(LEFT_CURLY_BRACE); 308 result->add(RIGHT_CURLY_BRACE); 309 result->freeze(); 310 return result; 311 } 312 313 UnicodeSet* initEscapableChars(UErrorCode& status) { 314 if (U_FAILURE(status)) { 315 return nullptr; 316 } 317 318 UnicodeSet* result = new UnicodeSet(); 319 if (result == nullptr) { 320 status = U_MEMORY_ALLOCATION_ERROR; 321 return nullptr; 322 } 323 result->add(PIPE); 324 result->add(BACKSLASH); 325 result->add(LEFT_CURLY_BRACE); 326 result->add(RIGHT_CURLY_BRACE); 327 result->freeze(); 328 return result; 329 } 330 331 namespace unisets { 332 333 UBool U_CALLCONV cleanupMF2ParseUniSets() { 334 for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { 335 delete gUnicodeSets[i]; 336 gUnicodeSets[i] = nullptr; 337 } 338 gMF2ParseUniSetsInitOnce.reset(); 339 return true; 340 } 341 342 void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) { 343 ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets); 344 /* 345 Each of the init functions initializes the UnicodeSets 346 that it depends on. 347 348 initBidiControls (no dependencies) 349 350 initEscapableChars (no dependencies) 351 352 initNameChars depends on 353 initDigits 354 initNameStartChars depends on 355 initAlpha 356 357 initQuotedChars depends on 358 initTextChars depends on 359 initContentChars 360 initWhitespace 361 */ 362 gUnicodeSets[unisets::BIDI] = initBidiControls(status); 363 gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status); 364 gUnicodeSets[unisets::QUOTED] = initQuotedChars(status); 365 gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status); 366 367 if (U_FAILURE(status)) { 368 cleanupMF2ParseUniSets(); 369 } 370 } 371 372 const UnicodeSet* get(Key key, UErrorCode& status) { 373 umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status); 374 if (U_FAILURE(status)) { 375 return nullptr; 376 } 377 UnicodeSet* result = getImpl(key); 378 if (result == nullptr) { 379 status = U_MEMORY_ALLOCATION_ERROR; 380 } 381 return result; 382 } 383 384 } 385 386 // ------------------------------------- 387 // Predicates 388 389 /* 390 The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar: 391 392 `isContentChar()` : `content-char` 393 `isTextChar()` : `text-char` 394 `isAlpha()` : `ALPHA` 395 `isDigit()` : `DIGIT` 396 `isNameStart()` : `name-start` 397 `isNameChar()` : `name-char` 398 `isUnquotedStart()` : `unquoted-start` 399 `isQuotedChar()` : `quoted-char` 400 `isWhitespace()` : `s` 401 */ 402 403 bool Parser::isContentChar(UChar32 c) const { 404 return contentChars->contains(c); 405 } 406 407 // See `bidi` in the MF2 grammar 408 bool Parser::isBidiControl(UChar32 c) const { 409 return bidiControlChars->contains(c); 410 } 411 412 // See `ws` in the MessageFormat 2 grammar 413 bool Parser::isWhitespace(UChar32 c) const { 414 return whitespaceChars->contains(c); 415 } 416 417 bool Parser::isTextChar(UChar32 c) const { 418 return textChars->contains(c); 419 } 420 421 bool Parser::isAlpha(UChar32 c) const { 422 return alphaChars->contains(c); 423 } 424 425 bool Parser::isDigit(UChar32 c) const { 426 return digitChars->contains(c); 427 } 428 429 bool Parser::isNameStart(UChar32 c) const { 430 return nameStartChars->contains(c); 431 } 432 433 bool Parser::isNameChar(UChar32 c) const { 434 return nameChars->contains(c); 435 } 436 437 bool Parser::isUnquotedStart(UChar32 c) const { 438 return isNameChar(c); 439 } 440 441 bool Parser::isQuotedChar(UChar32 c) const { 442 return quotedChars->contains(c); 443 } 444 445 bool Parser::isEscapableChar(UChar32 c) const { 446 return escapableChars->contains(c); 447 } 448 449 // Returns true iff `c` can begin a `function` nonterminal 450 static bool isFunctionStart(UChar32 c) { 451 switch (c) { 452 case COLON: { 453 return true; 454 } 455 default: { 456 return false; 457 } 458 } 459 } 460 461 // Returns true iff `c` can begin an `annotation` nonterminal 462 static bool isAnnotationStart(UChar32 c) { 463 return isFunctionStart(c); 464 } 465 466 // Returns true iff `c` can begin a `literal` nonterminal 467 bool Parser::isLiteralStart(UChar32 c) const { 468 return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c)); 469 } 470 471 // Returns true iff `c` can begin a `key` nonterminal 472 bool Parser::isKeyStart(UChar32 c) const { 473 return (c == ASTERISK || isLiteralStart(c)); 474 } 475 476 bool Parser::isDeclarationStart() { 477 return (peek() == ID_LOCAL[0] 478 && inBounds(1) 479 && peek(1) == ID_LOCAL[1]) 480 || (peek() == ID_INPUT[0] 481 && inBounds(1) 482 && peek(1) == ID_INPUT[1]); 483 } 484 485 // ------------------------------------- 486 // Parsing functions 487 488 489 /* 490 TODO: Since handling the whitespace ambiguities needs to be repeated 491 in several different places and is hard to factor out, 492 it probably would be better to replace the parser with a lexer + parser 493 to separate tokenizing from parsing, which would simplify the code significantly. 494 This has the disadvantage that there is no token grammar for MessageFormat, 495 so one would have to be invented that isn't a component of the spec. 496 */ 497 498 /* 499 This is a recursive-descent scannerless parser that, 500 with a few exceptions, uses 1 character of lookahead. 501 502 This may not be an exhaustive list, as the additions of attributes and reserved 503 statements introduced several new ambiguities. 504 505 All but three of the exceptions involve ambiguities about the meaning of whitespace. 506 One ambiguity not involving whitespace is: 507 identifier -> namespace ":" name 508 vs. 509 identifier -> name 510 511 `namespace` and `name` can't be distinguished without arbitrary lookahead. 512 (For how this is handled, see parseIdentifier()) 513 514 The second ambiguity not involving whitespace is: 515 complex-message -> *(declaration[s]) complex-body 516 -> declaration *(declaration[s]) complex-body 517 -> declaration complex-body 518 -> reserved-statement complex-body 519 -> .foo {$x} .match // ... 520 When processing the '.', arbitrary lookahead is required to distinguish the 521 arbitrary-length unsupported keyword from `.match`. 522 (For how this is handled, see parseDeclarations()). 523 524 The third ambiguity not involving whitespace is: 525 complex-message -> *(declaration [s]) complex-body 526 -> reserved-statement *(declaration [s]) complex-body 527 -> reserved-statement complex-body 528 -> reserved-statement quotedPattern 529 -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern 530 -> reserved-keyword expression quoted-pattern 531 Example: .foo {1} {{1}} 532 533 Without lookahead, the opening '{' of the quoted pattern can't be distinguished 534 from the opening '{' of another expression in the unsupported statement. 535 (Though this only requires 1 character of lookahead.) 536 537 Otherwise: 538 539 There are at least seven ambiguities in the grammar that can't be resolved with finite 540 lookahead (since whitespace sequences can be arbitrarily long). They are resolved 541 with a form of backtracking (early exit). No state needs to be saved/restored 542 since whitespace doesn't affect the shape of the resulting parse tree, so it's 543 not true backtracking. 544 545 In addition, the grammar has been refactored 546 in a semantics-preserving way in some cases to make the code easier to structure. 547 548 First: variant = when 1*(s key) [s] pattern 549 Example: when k {a} 550 When reading the first space after 'k', it's ambiguous whether it's the 551 required space before another key, or the optional space before `pattern`. 552 (See comments in parseNonEmptyKeys()) 553 554 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 555 annotation = (function *(s option)) / reserved 556 Example: {:f } 557 When reading the first space after 'f', it's ambiguous whether it's the 558 required space before an option, or the optional trailing space after an options list 559 (in this case, the options list is empty). 560 (See comments in parseOptions() -- handling this case also meant it was easier to base 561 the code on a slightly refactored grammar, which should be semantically equivalent.) 562 563 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 564 annotation = (function *(s option)) / reserved 565 Example: {@a } 566 Similar to the previous case; see comments in parseReserved() 567 568 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 569 Example: {|foo| } 570 When reading the first space after the '|', it's ambiguous whether it's the required 571 space before an annotation, or the optional trailing space before the '}'. 572 (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on 573 the same grammar refactoring as the second exception.) 574 575 Most functions match a non-terminal in the grammar, except as explained 576 in comments. 577 578 Fifth: matcher = match-statement 1*([s] variant) 579 -> match 1 *([s] selector) 1*([s] variant) 580 Example: match {42} * {{_}} 581 When reading the space after the first '}', it's unclear whether 582 it's the optional space before another selector, or the optional space 583 before a variant. 584 585 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}" 586 -> "{" [s] function *(s attribute) [s] "}" 587 -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}" 588 -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}" 589 590 Example: {:func @foo} 591 (Note: the same ambiguity is present with variable-expression and literal-expression) 592 593 Seventh: 594 595 596 When parsing the space, it's unclear whether it's the optional space before an 597 option, or the optional space before an attribute. 598 599 Unless otherwise noted in a comment, all helper functions that take 600 a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode` 601 have the precondition: 602 `index` < `len()` 603 and the postcondition: 604 `U_FAILURE(errorCode)` || `index < `len()` 605 */ 606 607 /* 608 No pre, no post. 609 A message may end with whitespace, so `index` may equal `len()` on exit. 610 */ 611 void Parser::parseRequiredWS(UErrorCode& errorCode) { 612 bool sawWhitespace = false; 613 614 // The loop exits either when we consume all the input, 615 // or when we see a non-whitespace character. 616 while (true) { 617 // Check if all input has been consumed 618 if (!inBounds()) { 619 // If whitespace isn't required -- or if we saw it already -- 620 // then the caller is responsible for checking this case and 621 // setting an error if necessary. 622 if (sawWhitespace) { 623 // Not an error. 624 return; 625 } 626 // Otherwise, whitespace is required; the end of the input has 627 // been reached without whitespace. This is an error. 628 ERROR(errorCode); 629 return; 630 } 631 632 // Input remains; process the next character if it's whitespace, 633 // exit the loop otherwise 634 if (isWhitespace(peek())) { 635 sawWhitespace = true; 636 // Increment line number in parse error if we consume a newline 637 maybeAdvanceLine(); 638 next(); 639 } else { 640 break; 641 } 642 } 643 644 if (!sawWhitespace) { 645 ERROR(errorCode); 646 } 647 } 648 649 void Parser::parseOptionalBidi() { 650 while (true) { 651 if (!inBounds()) { 652 return; 653 } 654 if (isBidiControl(peek())) { 655 next(); 656 } else { 657 break; 658 } 659 } 660 } 661 662 /* 663 No pre, no post, because a message may end with whitespace 664 Matches `s` in the MF2 grammar 665 */ 666 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) { 667 parseOptionalBidi(); 668 parseRequiredWS(errorCode); 669 parseOptionalWhitespace(); 670 normalizedInput += SPACE; 671 } 672 673 /* 674 No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`. 675 */ 676 void Parser::parseOptionalWhitespace() { 677 while (true) { 678 if (!inBounds()) { 679 return; 680 } 681 auto cp = peek(); 682 if (isWhitespace(cp) || isBidiControl(cp)) { 683 maybeAdvanceLine(); 684 next(); 685 } else { 686 break; 687 } 688 } 689 } 690 691 // Consumes a single character, signaling an error if `peek()` != `c` 692 // No postcondition -- a message can end with a '}' token 693 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) { 694 CHECK_BOUNDS(errorCode); 695 696 if (peek() == c) { 697 next(); 698 normalizedInput += c; 699 return; 700 } 701 // Next character didn't match -- error out 702 ERROR(errorCode); 703 } 704 705 /* 706 Consumes a fixed-length token, signaling an error if the token isn't a prefix of 707 the string beginning at `peek()` 708 No postcondition -- a message can end with a '}' token 709 */ 710 void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) { 711 U_ASSERT(inBounds()); 712 713 int32_t tokenPos = 0; 714 while (tokenPos < static_cast<int32_t>(token.length())) { 715 if (peek() != token[tokenPos]) { 716 ERROR(errorCode); 717 return; 718 } 719 normalizedInput += token[tokenPos]; 720 next(); 721 tokenPos++; 722 } 723 } 724 725 /* 726 Consumes optional whitespace, possibly advancing `index` to `index'`, 727 then consumes a fixed-length token (signaling an error if the token isn't a prefix of 728 the string beginning at `source[index']`), 729 then consumes optional whitespace again 730 */ 731 void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) { 732 // No need for error check or bounds check before parseOptionalWhitespace 733 parseOptionalWhitespace(); 734 // Establish precondition 735 CHECK_BOUNDS(errorCode); 736 parseToken(token, errorCode); 737 parseOptionalWhitespace(); 738 // Guarantee postcondition 739 CHECK_BOUNDS(errorCode); 740 } 741 742 /* 743 Consumes optional whitespace, possibly advancing `index` to `index'`, 744 then consumes a single character (signaling an error if it doesn't match 745 `source[index']`), 746 then consumes optional whitespace again 747 */ 748 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { 749 // No need for error check or bounds check before parseOptionalWhitespace() 750 parseOptionalWhitespace(); 751 // Establish precondition 752 CHECK_BOUNDS(errorCode); 753 parseToken(c, errorCode); 754 parseOptionalWhitespace(); 755 // Guarantee postcondition 756 CHECK_BOUNDS(errorCode); 757 } 758 759 /* 760 Consumes a possibly-empty sequence of name-chars. Appends to `str` 761 and returns `str`. 762 */ 763 UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) { 764 if (U_FAILURE(errorCode)) { 765 return {}; 766 } 767 768 while (isNameChar(peek())) { 769 UChar32 c = peek(); 770 str += c; 771 normalizedInput += c; 772 next(); 773 if (!inBounds()) { 774 ERROR(errorCode); 775 break; 776 } 777 } 778 779 return str; 780 } 781 782 /* 783 Consumes a non-empty sequence of `name-char`s, the first of which is 784 also a `name-start`. 785 that begins with a character `start` such that `isNameStart(start)`. 786 787 Returns this sequence. 788 789 (Matches the `name` nonterminal in the grammar.) 790 */ 791 UnicodeString Parser::parseName(UErrorCode& errorCode) { 792 UnicodeString name; 793 794 U_ASSERT(inBounds()); 795 796 if (!(isNameStart(peek()) || isBidiControl(peek()))) { 797 ERROR(errorCode); 798 return name; 799 } 800 801 // name = [bidi] name-start *name-char [bidi] 802 803 // [bidi] 804 parseOptionalBidi(); 805 806 // name-start *name-char 807 parseNameChars(name, errorCode); 808 809 // [bidi] 810 parseOptionalBidi(); 811 812 return name; 813 } 814 815 /* 816 Consumes a '$' followed by a `name`, returning a VariableName 817 with `name` as its name 818 819 (Matches the `variable` nonterminal in the grammar.) 820 */ 821 VariableName Parser::parseVariableName(UErrorCode& errorCode) { 822 VariableName result; 823 824 U_ASSERT(inBounds()); 825 826 parseToken(DOLLAR, errorCode); 827 if (!inBounds()) { 828 ERROR(errorCode); 829 return result; 830 } 831 return VariableName(parseName(errorCode)); 832 } 833 834 /* 835 Corresponds to the `identifier` nonterminal in the grammar 836 */ 837 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { 838 U_ASSERT(inBounds()); 839 840 UnicodeString result; 841 // The following is a hack to get around ambiguity in the grammar: 842 // identifier -> namespace ":" name 843 // vs. 844 // identifier -> name 845 // can't be distinguished without arbitrary lookahead. 846 // Instead, we treat the production as: 847 // identifier -> namespace *(":"name) 848 // and then check for multiple colons. 849 850 // Parse namespace 851 result += parseName(errorCode); 852 int32_t firstColon = -1; 853 while (inBounds() && peek() == COLON) { 854 // Parse ':' separator 855 if (firstColon == -1) { 856 firstColon = index; 857 } 858 parseToken(COLON, errorCode); 859 result += COLON; 860 // Check for message ending with something like "foo:" 861 if (!inBounds()) { 862 ERROR(errorCode); 863 } else { 864 // Parse name part 865 result += parseName(errorCode); 866 } 867 } 868 869 // If there's at least one ':', scan from the first ':' 870 // to the end of the name to check for multiple ':'s 871 if (firstColon != -1) { 872 for (int32_t i = firstColon + 1; i < result.length(); i++) { 873 if (result[i] == COLON) { 874 ERROR_AT(errorCode, i); 875 return {}; 876 } 877 } 878 } 879 880 return result; 881 } 882 883 /* 884 Consumes a reference to a function, matching the ": identifier" 885 in the `function` nonterminal in the grammar. 886 887 Returns the function name. 888 */ 889 FunctionName Parser::parseFunction(UErrorCode& errorCode) { 890 U_ASSERT(inBounds()); 891 if (!isFunctionStart(peek())) { 892 ERROR(errorCode); 893 return FunctionName(); 894 } 895 896 normalizedInput += peek(); 897 next(); // Consume the function start character 898 if (!inBounds()) { 899 ERROR(errorCode); 900 return FunctionName(); 901 } 902 return parseIdentifier(errorCode); 903 } 904 905 906 /* 907 Precondition: peek() == BACKSLASH 908 909 Consume an escaped character. 910 Corresponds to `escaped-char` in the grammar. 911 912 No postcondition (a message can end with an escaped char) 913 */ 914 UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { 915 U_ASSERT(inBounds()); 916 U_ASSERT(peek() == BACKSLASH); 917 normalizedInput += BACKSLASH; 918 next(); // Skip the initial backslash 919 UnicodeString str; 920 if (inBounds()) { 921 // Expect a '{', '|' or '}' 922 switch (peek()) { 923 case LEFT_CURLY_BRACE: 924 case RIGHT_CURLY_BRACE: 925 case PIPE: 926 case BACKSLASH: { 927 /* Append to the output string */ 928 str += peek(); 929 /* Update normalizedInput */ 930 normalizedInput += peek(); 931 /* Consume the character */ 932 next(); 933 return str; 934 } 935 default: { 936 // No other characters are allowed here 937 break; 938 } 939 } 940 } 941 // If control reaches here, there was an error 942 ERROR(errorCode); 943 return str; 944 } 945 946 947 /* 948 Consume and return a quoted literal, matching the `literal` nonterminal in the grammar. 949 */ 950 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) { 951 bool error = false; 952 953 UnicodeString contents; 954 if (U_SUCCESS(errorCode)) { 955 // Parse the opening '|' 956 parseToken(PIPE, errorCode); 957 if (!inBounds()) { 958 ERROR(errorCode); 959 error = true; 960 } else { 961 // Parse the contents 962 bool done = false; 963 while (!done) { 964 if (peek() == BACKSLASH) { 965 contents += parseEscapeSequence(errorCode); 966 } else if (isQuotedChar(peek())) { 967 contents += peek(); 968 // Handle cases like: 969 // |}{| -- we want to escape everywhere that 970 // can be escaped, to make round-trip checking 971 // easier -- so this case normalizes to 972 // |\}\{| 973 if (isEscapableChar(peek())) { 974 normalizedInput += BACKSLASH; 975 } 976 normalizedInput += peek(); 977 next(); // Consume this character 978 maybeAdvanceLine(); 979 } else { 980 // Assume the sequence of literal characters ends here 981 done = true; 982 } 983 if (!inBounds()) { 984 ERROR(errorCode); 985 error = true; 986 break; 987 } 988 } 989 } 990 } 991 992 if (error) { 993 return {}; 994 } 995 996 // Parse the closing '|' 997 parseToken(PIPE, errorCode); 998 999 return Literal(true, contents); 1000 } 1001 1002 // Parse (1*DIGIT) 1003 UnicodeString Parser::parseDigits(UErrorCode& errorCode) { 1004 if (U_FAILURE(errorCode)) { 1005 return {}; 1006 } 1007 1008 U_ASSERT(isDigit(peek())); 1009 1010 UnicodeString contents; 1011 do { 1012 contents += peek(); 1013 normalizedInput += peek(); 1014 next(); 1015 if (!inBounds()) { 1016 ERROR(errorCode); 1017 return {}; 1018 } 1019 } while (isDigit(peek())); 1020 1021 return contents; 1022 } 1023 /* 1024 Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar. 1025 */ 1026 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { 1027 if (U_FAILURE(errorCode)) { 1028 return {}; 1029 } 1030 // unquoted-literal = 1*name-char 1031 1032 if (!(isNameChar(peek()))) { 1033 ERROR(errorCode); 1034 return {}; 1035 } 1036 1037 UnicodeString contents; 1038 parseNameChars(contents, errorCode); 1039 return Literal(false, contents); 1040 } 1041 1042 /* 1043 Consume and return a literal, matching the `literal` nonterminal in the grammar. 1044 */ 1045 Literal Parser::parseLiteral(UErrorCode& errorCode) { 1046 Literal result; 1047 if (!inBounds()) { 1048 ERROR(errorCode); 1049 } else { 1050 if (peek() == PIPE) { 1051 result = parseQuotedLiteral(errorCode); 1052 } else { 1053 result = parseUnquotedLiteral(errorCode); 1054 } 1055 // Guarantee postcondition 1056 if (!inBounds()) { 1057 ERROR(errorCode); 1058 } 1059 } 1060 1061 return result; 1062 } 1063 1064 /* 1065 Consume a @name-value pair, matching the `attribute` nonterminal in the grammar. 1066 1067 Adds the option to `options` 1068 */ 1069 template<class T> 1070 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { 1071 U_ASSERT(inBounds()); 1072 1073 U_ASSERT(peek() == AT); 1074 // Consume the '@' 1075 parseToken(AT, errorCode); 1076 1077 // Parse LHS 1078 UnicodeString lhs = parseIdentifier(errorCode); 1079 1080 // Prepare to "backtrack" to resolve ambiguity 1081 // about whether whitespace precedes another 1082 // attribute, or the '=' sign 1083 int32_t savedIndex = index; 1084 parseOptionalWhitespace(); 1085 1086 Operand rand; 1087 if (peek() == EQUALS) { 1088 // Parse '=' 1089 parseTokenWithWhitespace(EQUALS, errorCode); 1090 1091 UnicodeString rhsStr; 1092 // Parse RHS, which must be a literal 1093 // attribute = "@" identifier [o "=" o literal] 1094 rand = Operand(parseLiteral(errorCode)); 1095 } else { 1096 // attribute -> "@" identifier [[s] "=" [s]] 1097 // Use null operand, which `rand` is already set to 1098 // "Backtrack" by restoring the whitespace (if there was any) 1099 index = savedIndex; 1100 } 1101 1102 attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode); 1103 } 1104 1105 /* 1106 Consume a name-value pair, matching the `option` nonterminal in the grammar. 1107 1108 Adds the option to `optionList` 1109 */ 1110 template<class T> 1111 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) { 1112 U_ASSERT(inBounds()); 1113 1114 // Parse LHS 1115 UnicodeString lhs = parseIdentifier(errorCode); 1116 1117 // Parse '=' 1118 parseTokenWithWhitespace(EQUALS, errorCode); 1119 1120 UnicodeString rhsStr; 1121 Operand rand; 1122 // Parse RHS, which is either a literal or variable 1123 switch (peek()) { 1124 case DOLLAR: { 1125 rand = Operand(parseVariableName(errorCode)); 1126 break; 1127 } 1128 default: { 1129 // Must be a literal 1130 rand = Operand(parseLiteral(errorCode)); 1131 break; 1132 } 1133 } 1134 U_ASSERT(!rand.isNull()); 1135 1136 // Finally, add the key=value mapping 1137 // Use a local error code, check for duplicate option error and 1138 // record it as with other errors 1139 UErrorCode status = U_ZERO_ERROR; 1140 addOption.addOption(lhs, std::move(rand), status); 1141 if (U_FAILURE(status)) { 1142 U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR); 1143 errors.setDuplicateOptionName(errorCode); 1144 } 1145 } 1146 1147 /* 1148 Note: there are multiple overloads of parseOptions() for parsing 1149 options within markup, vs. within an expression, vs. parsing 1150 attributes. This should be refactored. TODO 1151 */ 1152 1153 /* 1154 Consume optional whitespace followed by a sequence of options 1155 (possibly empty), separated by whitespace 1156 */ 1157 template <class T> 1158 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) { 1159 // Early exit if out of bounds -- no more work is possible 1160 CHECK_BOUNDS(errorCode); 1161 1162 /* 1163 Arbitrary lookahead is required to parse option lists. To see why, consider 1164 these rules from the grammar: 1165 1166 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 1167 annotation = (function *(s option)) / reserved 1168 1169 And this example: 1170 {:foo } 1171 1172 Derivation: 1173 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 1174 -> "{" [s] annotation [s] "}" 1175 -> "{" [s] ((function *(s option)) / reserved) [s] "}" 1176 -> "{" [s] function *(s option) [s] "}" 1177 1178 In this example, knowing whether to expect a '}' or the start of another option 1179 after the whitespace would require arbitrary lookahead -- in other words, which 1180 rule should we apply? 1181 *(s option) -> s option *(s option) 1182 or 1183 *(s option) -> 1184 1185 The same would apply to the example {:foo k=v } (note the trailing space after "v"). 1186 1187 This is addressed using a form of backtracking and (to make the backtracking easier 1188 to apply) a slight refactoring to the grammar. 1189 1190 This code is written as if the grammar is: 1191 expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}" 1192 annotation = (function *(s option) [s]) / (reserved [s]) 1193 1194 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning 1195 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes. 1196 1197 Note that when "backtracking" really just means early exit, since only whitespace 1198 is involved and there's no state to save. 1199 1200 There is a separate but similar ambiguity as to whether the space precedes 1201 an option or an attribute. 1202 */ 1203 1204 while(true) { 1205 // If the next character is not whitespace, that means we've already 1206 // parsed the entire options list (which may have been empty) and there's 1207 // no trailing whitespace. In that case, exit. 1208 if (!isWhitespace(peek())) { 1209 break; 1210 } 1211 int32_t firstWhitespace = index; 1212 1213 // In any case other than an empty options list, there must be at least 1214 // one whitespace character. 1215 parseRequiredWhitespace(errorCode); 1216 // Restore precondition 1217 CHECK_BOUNDS(errorCode); 1218 1219 // If a name character follows, then at least one more option remains 1220 // in the list. 1221 // Otherwise, we've consumed all the options and any trailing whitespace, 1222 // and can exit. 1223 // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, 1224 // so we back out to [s]. 1225 if (!isNameStart(peek())) { 1226 // We've consumed all the options (meaning that either we consumed non-empty 1227 // whitespace, or consumed at least one option.) 1228 // Done. 1229 // Remove the required whitespace from normalizedInput 1230 normalizedInput.truncate(normalizedInput.length() - 1); 1231 // "Backtrack" so as to leave the optional whitespace there 1232 // when parsing attributes 1233 index = firstWhitespace; 1234 break; 1235 } 1236 parseOption(addOption, errorCode); 1237 } 1238 } 1239 1240 /* 1241 Consume optional whitespace followed by a sequence of attributes 1242 (possibly empty), separated by whitespace 1243 */ 1244 template<class T> 1245 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { 1246 1247 // Early exit if out of bounds -- no more work is possible 1248 if (!inBounds()) { 1249 ERROR(errorCode); 1250 return; 1251 } 1252 1253 /* 1254 Arbitrary lookahead is required to parse attribute lists, similarly to option lists. 1255 (See comment in parseOptions()). 1256 */ 1257 1258 while(true) { 1259 // If the next character is not whitespace, that means we've already 1260 // parsed the entire attributes list (which may have been empty) and there's 1261 // no trailing whitespace. In that case, exit. 1262 if (!isWhitespace(peek())) { 1263 break; 1264 } 1265 1266 // In any case other than an empty attributes list, there must be at least 1267 // one whitespace character. 1268 parseRequiredWhitespace(errorCode); 1269 // Restore precondition 1270 if (!inBounds()) { 1271 ERROR(errorCode); 1272 break; 1273 } 1274 1275 // If an '@' follows, then at least one more attribute remains 1276 // in the list. 1277 // Otherwise, we've consumed all the attributes and any trailing whitespace, 1278 // and can exit. 1279 // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, 1280 // so we back out to [s]. 1281 if (peek() != AT) { 1282 // We've consumed all the attributes (meaning that either we consumed non-empty 1283 // whitespace, or consumed at least one attribute.) 1284 // Done. 1285 // Remove the whitespace from normalizedInput 1286 normalizedInput.truncate(normalizedInput.length() - 1); 1287 break; 1288 } 1289 parseAttribute(attrAdder, errorCode); 1290 } 1291 } 1292 1293 /* 1294 Consume a function call, matching the `annotation` 1295 nonterminal in the grammar 1296 1297 Returns an `Operator` representing this (a reserved is a parse error) 1298 */ 1299 Operator Parser::parseAnnotation(UErrorCode& status) { 1300 U_ASSERT(inBounds()); 1301 Operator::Builder ratorBuilder(status); 1302 if (U_FAILURE(status)) { 1303 return {}; 1304 } 1305 if (isFunctionStart(peek())) { 1306 // Consume the function name 1307 FunctionName func = parseFunction(status); 1308 ratorBuilder.setFunctionName(std::move(func)); 1309 1310 OptionAdder<Operator::Builder> addOptions(ratorBuilder); 1311 // Consume the options (which may be empty) 1312 parseOptions(addOptions, status); 1313 } else { 1314 ERROR(status); 1315 } 1316 return ratorBuilder.build(status); 1317 } 1318 1319 /* 1320 Consume a literal or variable (depending on `isVariable`), 1321 followed by either required whitespace followed by an annotation, 1322 or optional whitespace. 1323 */ 1324 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable, 1325 Expression::Builder& builder, 1326 UErrorCode& status) { 1327 CHECK_ERROR(status); 1328 1329 U_ASSERT(inBounds()); 1330 1331 Operand rand; 1332 if (isVariable) { 1333 rand = Operand(parseVariableName(status)); 1334 } else { 1335 rand = Operand(parseLiteral(status)); 1336 } 1337 1338 builder.setOperand(std::move(rand)); 1339 1340 /* 1341 Parsing a literal or variable with an optional annotation requires arbitrary lookahead. 1342 To see why, consider this rule from the grammar: 1343 1344 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 1345 1346 And this example: 1347 1348 {|foo| } 1349 1350 Derivation: 1351 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" 1352 -> "{" [s] ((literal / variable) [s annotation]) [s] "}" 1353 -> "{" [s] (literal [s annotation]) [s] "}" 1354 1355 When reading the ' ' after the second '|', it's ambiguous whether that's the required 1356 space before an annotation, or the optional space before the '}'. 1357 1358 To make this ambiguity easier to handle, this code is based on the same grammar 1359 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See 1360 the comment in `parseOptions()` for details. 1361 */ 1362 1363 if (isWhitespace(peek())) { 1364 int32_t firstWhitespace = index; 1365 1366 // If the next character is whitespace, either [s annotation] or [s] applies 1367 // (the character is either the required space before an annotation, or optional 1368 // trailing space after the literal or variable). It's still ambiguous which 1369 // one does apply. 1370 parseOptionalWhitespace(); 1371 // Restore precondition 1372 CHECK_BOUNDS(status); 1373 1374 // This next check resolves the ambiguity between [s annotation] and [s] 1375 bool isSAnnotation = isAnnotationStart(peek()); 1376 1377 if (isSAnnotation) { 1378 normalizedInput += SPACE; 1379 } 1380 1381 if (isSAnnotation) { 1382 // The previously consumed whitespace precedes an annotation 1383 builder.setOperator(parseAnnotation(status)); 1384 } else { 1385 // Either there's a right curly brace (will be consumed by the caller), 1386 // or there's an error and the trailing whitespace should be 1387 // handled by the caller. However, this is not an error 1388 // here because we're just parsing `literal [s annotation]`. 1389 index = firstWhitespace; 1390 } 1391 } else { 1392 // Either there was never whitespace, or 1393 // the previously consumed whitespace is the optional trailing whitespace; 1394 // either the next character is '}' or the error will be handled by parseExpression. 1395 // Do nothing, since the operand was already set 1396 } 1397 1398 // At the end of this code, the next character should either be '}', 1399 // whitespace followed by a '}', 1400 // or end-of-input 1401 } 1402 1403 /* 1404 Consume an expression, matching the `expression` nonterminal in the grammar 1405 */ 1406 1407 static void exprFallback(Expression::Builder& exprBuilder) { 1408 // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER 1409 // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution 1410 exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT)))); 1411 } 1412 1413 static Expression exprFallback(UErrorCode& status) { 1414 Expression result; 1415 if (U_SUCCESS(status)) { 1416 Expression::Builder exprBuilder(status); 1417 if (U_SUCCESS(status)) { 1418 // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER 1419 // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution 1420 exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT)))); 1421 UErrorCode status = U_ZERO_ERROR; 1422 result = exprBuilder.build(status); 1423 // An operand was set, so there can't be an error 1424 U_ASSERT(U_SUCCESS(status)); 1425 } 1426 } 1427 return result; 1428 } 1429 1430 Expression Parser::parseExpression(UErrorCode& status) { 1431 if (U_FAILURE(status)) { 1432 return {}; 1433 } 1434 1435 // Early return if out of input -- no more work is possible 1436 U_ASSERT(inBounds()); 1437 1438 // Parse opening brace 1439 parseToken(LEFT_CURLY_BRACE, status); 1440 // Optional whitespace after opening brace 1441 parseOptionalWhitespace(); 1442 1443 Expression::Builder exprBuilder(status); 1444 // Restore precondition 1445 if (!inBounds()) { 1446 exprFallback(exprBuilder); 1447 } else { 1448 // literal '|', variable '$' or annotation 1449 switch (peek()) { 1450 case PIPE: { 1451 // Quoted literal 1452 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); 1453 break; 1454 } 1455 case DOLLAR: { 1456 // Variable 1457 parseLiteralOrVariableWithAnnotation(true, exprBuilder, status); 1458 break; 1459 } 1460 default: { 1461 if (isAnnotationStart(peek())) { 1462 Operator rator = parseAnnotation(status); 1463 exprBuilder.setOperator(std::move(rator)); 1464 } else if (isUnquotedStart(peek())) { 1465 // Unquoted literal 1466 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); 1467 } else { 1468 // Not a literal, variable or annotation -- error out 1469 ERROR(status); 1470 exprFallback(exprBuilder); 1471 break; 1472 } 1473 break; 1474 } 1475 } 1476 } 1477 1478 // Parse attributes 1479 AttributeAdder<Expression::Builder> attrAdder(exprBuilder); 1480 parseAttributes(attrAdder, status); 1481 1482 // Parse optional space 1483 // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}") 1484 parseOptionalWhitespace(); 1485 1486 // Either an operand or operator (or both) must have been set already, 1487 // so there can't be an error 1488 UErrorCode localStatus = U_ZERO_ERROR; 1489 Expression result = exprBuilder.build(localStatus); 1490 U_ASSERT(U_SUCCESS(localStatus)); 1491 1492 // Check for end-of-input and missing '}' 1493 if (!inBounds()) { 1494 ERROR(status); 1495 } else { 1496 // Otherwise, it's safe to check for the '}' 1497 parseToken(RIGHT_CURLY_BRACE, status); 1498 } 1499 return result; 1500 } 1501 1502 /* 1503 Parse a .local declaration, matching the `local-declaration` 1504 production in the grammar 1505 */ 1506 void Parser::parseLocalDeclaration(UErrorCode& status) { 1507 // End-of-input here would be an error; even empty 1508 // declarations must be followed by a body 1509 CHECK_BOUNDS(status); 1510 1511 parseToken(ID_LOCAL, status); 1512 parseRequiredWhitespace(status); 1513 1514 // Restore precondition 1515 CHECK_BOUNDS(status); 1516 VariableName lhs = parseVariableName(status); 1517 parseTokenWithWhitespace(EQUALS, status); 1518 // Restore precondition before calling parseExpression() 1519 CHECK_BOUNDS(status); 1520 1521 Expression rhs = parseExpression(status); 1522 1523 // Add binding from lhs to rhs, unless there was an error 1524 // (This ensures that if there was a correct lhs but a 1525 // parse error in rhs, the fallback for uses of the 1526 // lhs will be its own name rather than the rhs) 1527 /* This affects the behavior of this test case, which the spec 1528 is ambiguous about: 1529 1530 .local $bar {|foo|} {{{$bar}}} 1531 1532 Should `$bar` still be bound to a value although 1533 its declaration is syntactically incorrect (missing the '=')? 1534 This code says no, but it needs to change if 1535 https://github.com/unicode-org/message-format-wg/issues/703 1536 is resolved differently. 1537 */ 1538 CHECK_ERROR(status); 1539 if (!errors.hasSyntaxError()) { 1540 dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status); 1541 // Check if status is U_DUPLICATE_DECLARATION_ERROR 1542 // and add that as an internal error if so 1543 if (status == U_MF_DUPLICATE_DECLARATION_ERROR) { 1544 status = U_ZERO_ERROR; 1545 errors.addError(StaticErrorType::DuplicateDeclarationError, status); 1546 } 1547 } 1548 } 1549 1550 /* 1551 Parse an .input declaration, matching the `local-declaration` 1552 production in the grammar 1553 */ 1554 void Parser::parseInputDeclaration(UErrorCode& status) { 1555 // End-of-input here would be an error; even empty 1556 // declarations must be followed by a body 1557 CHECK_BOUNDS(status); 1558 1559 parseToken(ID_INPUT, status); 1560 parseOptionalWhitespace(); 1561 1562 // Restore precondition before calling parseExpression() 1563 CHECK_BOUNDS(status); 1564 1565 // Save the index for error diagnostics 1566 int32_t exprIndex = index; 1567 Expression rhs = parseExpression(status); 1568 1569 // Here we have to check that the rhs is a variable-expression 1570 if (!rhs.getOperand().isVariable()) { 1571 // This case is a syntax error; report it at the beginning 1572 // of the expression 1573 ERROR_AT(status, exprIndex); 1574 return; 1575 } 1576 1577 VariableName lhs = rhs.getOperand().asVariable(); 1578 1579 // Add binding from lhs to rhs 1580 // This just adds a new local variable that shadows the message 1581 // argument referred to, which is harmless. 1582 // When evaluating the RHS, the new local is not in scope 1583 // and the message argument will be correctly referred to. 1584 CHECK_ERROR(status); 1585 if (!errors.hasSyntaxError()) { 1586 dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status); 1587 // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR 1588 // and add that as an internal error if so 1589 if (status == U_MF_DUPLICATE_DECLARATION_ERROR) { 1590 status = U_ZERO_ERROR; 1591 errors.addError(StaticErrorType::DuplicateDeclarationError, status); 1592 } 1593 } 1594 } 1595 1596 /* 1597 Consume a possibly-empty sequence of declarations separated by whitespace; 1598 each declaration matches the `declaration` nonterminal in the grammar 1599 1600 Builds up an environment representing those declarations 1601 */ 1602 void Parser::parseDeclarations(UErrorCode& status) { 1603 // End-of-input here would be an error; even empty 1604 // declarations must be followed by a body 1605 CHECK_BOUNDS(status); 1606 1607 while (peek() == PERIOD) { 1608 CHECK_BOUNDS_1(status); 1609 if (peek(1) == ID_LOCAL[1]) { 1610 parseLocalDeclaration(status); 1611 } else if (peek(1) == ID_INPUT[1]) { 1612 parseInputDeclaration(status); 1613 } else { 1614 // Done parsing declarations 1615 break; 1616 } 1617 1618 // Avoid looping infinitely 1619 CHECK_ERROR(status); 1620 1621 parseOptionalWhitespace(); 1622 // Restore precondition 1623 CHECK_BOUNDS(status); 1624 } 1625 } 1626 1627 /* 1628 Consume a text character 1629 matching the `text-char` nonterminal in the grammar 1630 1631 No postcondition (a message can end with a text-char) 1632 */ 1633 UnicodeString Parser::parseTextChar(UErrorCode& status) { 1634 UnicodeString str; 1635 if (!inBounds() || !(isTextChar(peek()))) { 1636 // Error -- text-char is expected here 1637 ERROR(status); 1638 } else { 1639 // See comment in parseQuotedLiteral() 1640 if (isEscapableChar(peek())) { 1641 normalizedInput += BACKSLASH; 1642 } 1643 normalizedInput += peek(); 1644 str += peek(); 1645 next(); 1646 maybeAdvanceLine(); 1647 } 1648 return str; 1649 } 1650 1651 /* 1652 Consume an `nmtoken`, `literal`, or the string "*", matching 1653 the `key` nonterminal in the grammar 1654 */ 1655 Key Parser::parseKey(UErrorCode& status) { 1656 U_ASSERT(inBounds()); 1657 1658 Key k; // wildcard by default 1659 // Literal | '*' 1660 switch (peek()) { 1661 case ASTERISK: { 1662 next(); 1663 normalizedInput += ASTERISK; 1664 // Guarantee postcondition 1665 if (!inBounds()) { 1666 ERROR(status); 1667 return k; 1668 } 1669 break; 1670 } 1671 default: { 1672 // Literal 1673 k = Key(parseLiteral(status)); 1674 break; 1675 } 1676 } 1677 return k; 1678 } 1679 1680 /* 1681 Consume a non-empty sequence of `key`s separated by whitespace 1682 1683 Takes ownership of `keys` 1684 */ 1685 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) { 1686 SelectorKeys result; 1687 1688 if (U_FAILURE(status)) { 1689 return result; 1690 } 1691 1692 U_ASSERT(inBounds()); 1693 1694 /* 1695 Arbitrary lookahead is required to parse key lists. To see why, consider 1696 this rule from the grammar: 1697 1698 variant = key *(s key) [s] quoted-pattern 1699 1700 And this example: 1701 when k1 k2 {a} 1702 1703 Derivation: 1704 variant -> key *(s key) [s] quoted-pattern 1705 -> key s key *(s key) quoted-pattern 1706 1707 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead 1708 to know whether to expect the start of a pattern or the start of another key. 1709 In other words: is the second whitespace sequence the required space in *(s key), 1710 or the optional space in [s] quoted-pattern? 1711 1712 This is addressed using "backtracking" (similarly to `parseOptions()`). 1713 */ 1714 1715 SelectorKeys::Builder keysBuilder(status); 1716 if (U_FAILURE(status)) { 1717 return result; 1718 } 1719 1720 // Since the first key is required, it's simplest to parse it separately. 1721 keysBuilder.add(parseKey(status), status); 1722 1723 // Restore precondition 1724 if (!inBounds()) { 1725 ERROR(status); 1726 return result; 1727 } 1728 1729 // We've seen at least one whitespace-key pair, so now we can parse 1730 // *(s key) [s] 1731 while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) { 1732 bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek()); 1733 parseRequiredWhitespace(status); 1734 if (!wasWhitespace) { 1735 // Avoid infinite loop when parsing something like: 1736 // when * @{!... 1737 next(); 1738 } 1739 1740 // Restore precondition 1741 if (!inBounds()) { 1742 ERROR(status); 1743 return result; 1744 } 1745 1746 // At this point, it's ambiguous whether we are inside (s key) or [s]. 1747 // This check resolves that ambiguity. 1748 if (peek() == LEFT_CURLY_BRACE) { 1749 // A pattern follows, so what we just parsed was the optional 1750 // trailing whitespace. All the keys have been parsed. 1751 1752 // Unpush the whitespace from `normalizedInput` 1753 normalizedInput.truncate(normalizedInput.length() - 1); 1754 break; 1755 } 1756 keysBuilder.add(parseKey(status), status); 1757 } 1758 1759 return keysBuilder.build(status); 1760 } 1761 1762 Pattern Parser::parseQuotedPattern(UErrorCode& status) { 1763 U_ASSERT(inBounds()); 1764 1765 parseToken(LEFT_CURLY_BRACE, status); 1766 parseToken(LEFT_CURLY_BRACE, status); 1767 Pattern p = parseSimpleMessage(status); 1768 parseToken(RIGHT_CURLY_BRACE, status); 1769 parseToken(RIGHT_CURLY_BRACE, status); 1770 return p; 1771 } 1772 1773 /* 1774 Consume a `placeholder`, matching the nonterminal in the grammar 1775 No postcondition (a markup can end a message) 1776 */ 1777 Markup Parser::parseMarkup(UErrorCode& status) { 1778 U_ASSERT(inBounds(1)); 1779 1780 U_ASSERT(peek() == LEFT_CURLY_BRACE); 1781 1782 Markup::Builder builder(status); 1783 if (U_FAILURE(status)) { 1784 return {}; 1785 } 1786 1787 // Consume the '{' 1788 next(); 1789 normalizedInput += LEFT_CURLY_BRACE; 1790 parseOptionalWhitespace(); 1791 bool closing = false; 1792 switch (peek()) { 1793 case NUMBER_SIGN: { 1794 // Open or standalone; consume the '#' 1795 normalizedInput += peek(); 1796 next(); 1797 break; 1798 } 1799 case SLASH: { 1800 // Closing 1801 normalizedInput += peek(); 1802 closing = true; 1803 next(); 1804 break; 1805 } 1806 default: { 1807 ERROR(status); 1808 return {}; 1809 } 1810 } 1811 1812 // Parse the markup identifier 1813 builder.setName(parseIdentifier(status)); 1814 1815 // Parse the options, which must begin with a ' ' 1816 // if present 1817 if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { 1818 OptionAdder<Markup::Builder> optionAdder(builder); 1819 parseOptions(optionAdder, status); 1820 } 1821 1822 // Parse the attributes, which also must begin 1823 // with a ' ' 1824 if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { 1825 AttributeAdder<Markup::Builder> attrAdder(builder); 1826 parseAttributes(attrAdder, status); 1827 } 1828 1829 parseOptionalWhitespace(); 1830 1831 bool standalone = false; 1832 // Check if this is a standalone or not 1833 if (!closing) { 1834 if (inBounds() && peek() == SLASH) { 1835 standalone = true; 1836 normalizedInput += SLASH; 1837 next(); 1838 } 1839 } 1840 1841 parseToken(RIGHT_CURLY_BRACE, status); 1842 1843 if (standalone) { 1844 builder.setStandalone(); 1845 } else if (closing) { 1846 builder.setClose(); 1847 } else { 1848 builder.setOpen(); 1849 } 1850 1851 return builder.build(status); 1852 } 1853 1854 /* 1855 Consume a `placeholder`, matching the nonterminal in the grammar 1856 No postcondition (a placeholder can end a message) 1857 */ 1858 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) { 1859 U_ASSERT(peek() == LEFT_CURLY_BRACE); 1860 1861 if (!inBounds()) { 1862 ERROR(status); 1863 return exprFallback(status); 1864 } 1865 1866 // Need to look ahead arbitrarily since whitespace 1867 // can appear before the '{' and '#' 1868 // in markup 1869 int32_t tempIndex = 1; 1870 bool isMarkup = false; 1871 while (inBounds(1)) { 1872 UChar32 c = peek(tempIndex); 1873 if (c == NUMBER_SIGN || c == SLASH) { 1874 isMarkup = true; 1875 break; 1876 } 1877 if (!(isWhitespace(c) || isBidiControl(c))) { 1878 break; 1879 } 1880 tempIndex++; 1881 } 1882 1883 if (isMarkup) { 1884 return parseMarkup(status); 1885 } 1886 return parseExpression(status); 1887 } 1888 1889 /* 1890 Consume a `simple-message`, matching the nonterminal in the grammar 1891 Postcondition: `index == len()` or U_FAILURE(status); 1892 for a syntactically correct message, this will consume the entire input 1893 */ 1894 Pattern Parser::parseSimpleMessage(UErrorCode& status) { 1895 Pattern::Builder result(status); 1896 1897 if (U_SUCCESS(status)) { 1898 Expression expression; 1899 while (inBounds()) { 1900 switch (peek()) { 1901 case LEFT_CURLY_BRACE: { 1902 // Must be placeholder 1903 std::variant<Expression, Markup> piece = parsePlaceholder(status); 1904 if (std::holds_alternative<Expression>(piece)) { 1905 Expression expr = *std::get_if<Expression>(&piece); 1906 result.add(std::move(expr), status); 1907 } else { 1908 Markup markup = *std::get_if<Markup>(&piece); 1909 result.add(std::move(markup), status); 1910 } 1911 break; 1912 } 1913 case BACKSLASH: { 1914 // Must be escaped-char 1915 result.add(parseEscapeSequence(status), status); 1916 break; 1917 } 1918 case RIGHT_CURLY_BRACE: { 1919 // Distinguish unescaped '}' from end of quoted pattern 1920 break; 1921 } 1922 default: { 1923 // Must be text-char 1924 result.add(parseTextChar(status), status); 1925 break; 1926 } 1927 } 1928 if (peek() == RIGHT_CURLY_BRACE) { 1929 // End of quoted pattern 1930 break; 1931 } 1932 // Don't loop infinitely 1933 if (errors.hasSyntaxError() || U_FAILURE(status)) { 1934 break; 1935 } 1936 } 1937 } 1938 return result.build(status); 1939 } 1940 1941 void Parser::parseVariant(UErrorCode& status) { 1942 CHECK_ERROR(status); 1943 1944 // At least one key is required 1945 SelectorKeys keyList(parseNonEmptyKeys(status)); 1946 1947 // parseNonEmptyKeys() consumes any trailing whitespace, 1948 // so the pattern can be consumed next. 1949 1950 // Restore precondition before calling parsePattern() 1951 // (which must return a non-null value) 1952 CHECK_BOUNDS(status); 1953 Pattern rhs = parseQuotedPattern(status); 1954 1955 dataModel.addVariant(std::move(keyList), std::move(rhs), status); 1956 } 1957 1958 /* 1959 Consume a `selectors` (matching the nonterminal in the grammar), 1960 followed by a non-empty sequence of `variant`s (matching the nonterminal 1961 in the grammar) preceded by whitespace 1962 No postcondition (on return, `index` might equal `len()` with no syntax error 1963 because a message can end with a variant) 1964 */ 1965 void Parser::parseSelectors(UErrorCode& status) { 1966 CHECK_ERROR(status); 1967 1968 U_ASSERT(inBounds()); 1969 1970 parseToken(ID_MATCH, status); 1971 1972 bool empty = true; 1973 // Parse selectors 1974 // "Backtracking" is required here. It's not clear if whitespace is 1975 // (`[s]` selector) or (`[s]` variant) 1976 while (isWhitespace(peek()) || peek() == DOLLAR) { 1977 int32_t whitespaceStart = index; 1978 parseRequiredWhitespace(status); 1979 // Restore precondition 1980 CHECK_BOUNDS(status); 1981 if (peek() != DOLLAR) { 1982 // This is not necessarily an error, but rather, 1983 // means the whitespace we parsed was the optional 1984 // whitespace preceding the first variant, not the 1985 // required whitespace preceding a subsequent variable. 1986 // In that case, "push back" the whitespace. 1987 normalizedInput.truncate(normalizedInput.length() - 1); 1988 index = whitespaceStart; 1989 break; 1990 } 1991 VariableName var = parseVariableName(status); 1992 empty = false; 1993 1994 dataModel.addSelector(std::move(var), status); 1995 CHECK_ERROR(status); 1996 } 1997 1998 // At least one selector is required 1999 if (empty) { 2000 ERROR(status); 2001 return; 2002 } 2003 2004 #define CHECK_END_OF_INPUT \ 2005 if (!inBounds()) { \ 2006 break; \ 2007 } \ 2008 2009 // Parse variants 2010 // matcher = match-statement s variant *(o variant) 2011 2012 // Parse first variant 2013 parseRequiredWhitespace(status); 2014 if (!inBounds()) { 2015 ERROR(status); 2016 return; 2017 } 2018 parseVariant(status); 2019 if (!inBounds()) { 2020 // Not an error; there might be only one variant 2021 return; 2022 } 2023 2024 while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) { 2025 parseOptionalWhitespace(); 2026 // Restore the precondition. 2027 // Trailing whitespace is allowed. 2028 if (!inBounds()) { 2029 return; 2030 } 2031 2032 parseVariant(status); 2033 2034 // Restore the precondition, *without* erroring out if we've 2035 // reached the end of input. That's because it's valid for the 2036 // message to end with a variant that has no trailing whitespace. 2037 // Why do we need to check this condition twice inside the loop? 2038 // Because if we don't check it here, the `isWhitespace()` call in 2039 // the loop head will read off the end of the input string. 2040 CHECK_END_OF_INPUT 2041 2042 if (errors.hasSyntaxError() || U_FAILURE(status)) { 2043 break; 2044 } 2045 } 2046 } 2047 2048 /* 2049 Consume a `body` (matching the nonterminal in the grammar), 2050 No postcondition (on return, `index` might equal `len()` with no syntax error, 2051 because a message can end with a body (trailing whitespace is optional) 2052 */ 2053 2054 void Parser::errorPattern(UErrorCode& status) { 2055 errors.addSyntaxError(status); 2056 // Set to empty pattern 2057 Pattern::Builder result = Pattern::Builder(status); 2058 CHECK_ERROR(status); 2059 2060 // If still in bounds, then add the remaining input as a single text part 2061 // to the pattern 2062 /* 2063 TODO: this behavior isn't documented in the spec, but it comes from 2064 https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236 2065 and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify 2066 whether this is the intent behind the spec 2067 */ 2068 UnicodeString partStr(LEFT_CURLY_BRACE); 2069 while (inBounds()) { 2070 partStr += peek(); 2071 next(); 2072 } 2073 // Add curly braces around the entire output (same comment as above) 2074 partStr += RIGHT_CURLY_BRACE; 2075 result.add(std::move(partStr), status); 2076 dataModel.setPattern(result.build(status)); 2077 } 2078 2079 void Parser::parseBody(UErrorCode& status) { 2080 CHECK_ERROR(status); 2081 2082 // Out-of-input is a syntax warning 2083 if (!inBounds()) { 2084 errorPattern(status); 2085 return; 2086 } 2087 2088 // Body must be either a pattern or selectors 2089 switch (peek()) { 2090 case LEFT_CURLY_BRACE: { 2091 // Pattern 2092 dataModel.setPattern(parseQuotedPattern(status)); 2093 break; 2094 } 2095 case ID_MATCH[0]: { 2096 // Selectors 2097 parseSelectors(status); 2098 return; 2099 } 2100 default: { 2101 ERROR(status); 2102 errorPattern(status); 2103 return; 2104 } 2105 } 2106 } 2107 2108 // ------------------------------------- 2109 // Parses the source pattern. 2110 2111 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { 2112 CHECK_ERROR(status); 2113 2114 bool complex = false; 2115 // First, "look ahead" to determine if this is a simple or complex 2116 // message. To do that, check the first non-whitespace character. 2117 while (inBounds(index) && (isWhitespace(peek()) || isBidiControl(peek()))) { 2118 next(); 2119 } 2120 2121 // Message can be empty, so we need to only look ahead 2122 // if we know it's non-empty 2123 if (inBounds()) { 2124 if (peek() == PERIOD 2125 || (inBounds(1) 2126 && peek() == LEFT_CURLY_BRACE 2127 && peek(1) == LEFT_CURLY_BRACE)) { 2128 complex = true; 2129 } 2130 } 2131 // Reset index 2132 index = 0; 2133 2134 // Message can be empty, so we need to only look ahead 2135 // if we know it's non-empty 2136 if (complex) { 2137 parseOptionalWhitespace(); 2138 parseDeclarations(status); 2139 parseBody(status); 2140 parseOptionalWhitespace(); 2141 } else { 2142 // Simple message 2143 // For normalization, quote the pattern 2144 normalizedInput += LEFT_CURLY_BRACE; 2145 normalizedInput += LEFT_CURLY_BRACE; 2146 dataModel.setPattern(parseSimpleMessage(status)); 2147 normalizedInput += RIGHT_CURLY_BRACE; 2148 normalizedInput += RIGHT_CURLY_BRACE; 2149 } 2150 2151 CHECK_ERROR(status); 2152 2153 // There are no errors; finally, check that the entire input was consumed 2154 if (!allConsumed()) { 2155 ERROR(status); 2156 } 2157 2158 // Finally, copy the relevant fields of the internal `MessageParseError` 2159 // into the `UParseError` argument 2160 translateParseError(parseError, parseErrorResult); 2161 } 2162 2163 Parser::~Parser() {} 2164 2165 } // namespace message2 2166 U_NAMESPACE_END 2167 2168 #endif /* #if !UCONFIG_NO_MF2 */ 2169 2170 #endif /* #if !UCONFIG_NO_FORMATTING */ 2171 2172 #endif /* #if !UCONFIG_NO_NORMALIZATION */