messageformat2_parser.h (8780B)
1 // © 2024 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #ifndef U_HIDE_DEPRECATED_API 7 8 #ifndef MESSAGEFORMAT_PARSER_H 9 #define MESSAGEFORMAT_PARSER_H 10 11 #include "unicode/messageformat2_data_model.h" 12 #include "unicode/parseerr.h" 13 #include "unicode/uniset.h" 14 15 #include "messageformat2_allocation.h" 16 #include "messageformat2_errors.h" 17 18 #if U_SHOW_CPLUSPLUS_API 19 20 #if !UCONFIG_NO_NORMALIZATION 21 22 #if !UCONFIG_NO_FORMATTING 23 24 #if !UCONFIG_NO_MF2 25 26 U_NAMESPACE_BEGIN 27 28 namespace message2 { 29 30 using namespace data_model; 31 32 // Used for parameterizing options parsing code 33 // over the two builders that use it (Operator and Markup) 34 template <class T> 35 class OptionAdder { 36 private: 37 T& builder; 38 public: 39 OptionAdder(T& b) : builder(b) {} 40 void addOption(const UnicodeString& k, Operand&& r, UErrorCode& s) { 41 builder.addOption(k, std::move(r), s); 42 } 43 }; 44 45 // Used for parameterizing attributes parsing code 46 // over the two builders that use it (Expression and Markup) 47 // Unfortunately the same OptionAdder class can't just be reused, 48 // becaues duplicate options are forbidden while duplicate attributes are not 49 template <class T> 50 class AttributeAdder { 51 private: 52 T& builder; 53 public: 54 AttributeAdder(T& b) : builder(b) {} 55 void addAttribute(const UnicodeString& k, Operand&& r, UErrorCode& s) { 56 builder.addAttribute(k, std::move(r), s); 57 } 58 }; 59 60 61 // Initialization of UnicodeSets 62 namespace unisets { 63 enum Key { 64 CONTENT, 65 WHITESPACE, 66 BIDI, 67 ALPHA, 68 DIGIT, 69 NAME_START, 70 NAME_CHAR, 71 TEXT, 72 QUOTED, 73 ESCAPABLE, 74 UNISETS_KEY_COUNT 75 }; 76 77 U_I18N_API const UnicodeSet* get(Key key, UErrorCode& status); 78 } 79 80 // Parser class (private) 81 class Parser : public UMemory { 82 public: 83 virtual ~Parser(); 84 private: 85 friend class MessageFormatter; 86 87 void parse(UParseError&, UErrorCode&); 88 89 /* 90 Use an internal "parse error" structure to make it easier to translate 91 absolute offsets to line offsets. 92 This is translated back to a `UParseError` at the end of parsing. 93 */ 94 typedef struct MessageParseError { 95 // The line on which the error occurred 96 uint32_t line; 97 // The offset, relative to the erroneous line, on which the error occurred 98 uint32_t offset; 99 // The total number of characters seen before advancing to the current line. It has a value of 0 if line == 0. 100 // It includes newline characters, because the index does too. 101 uint32_t lengthBeforeCurrentLine; 102 103 // This parser doesn't yet use the last two fields. 104 UChar preContext[U_PARSE_CONTEXT_LEN]; 105 UChar postContext[U_PARSE_CONTEXT_LEN]; 106 } MessageParseError; 107 108 Parser(const UnicodeString &input, 109 MFDataModel::Builder& dataModelBuilder, 110 StaticErrors& e, 111 UnicodeString& normalizedInputRef, 112 UErrorCode& status) 113 : contentChars(unisets::get(unisets::CONTENT, status)), 114 whitespaceChars(unisets::get(unisets::WHITESPACE, status)), 115 bidiControlChars(unisets::get(unisets::BIDI, status)), 116 alphaChars(unisets::get(unisets::ALPHA, status)), 117 digitChars(unisets::get(unisets::DIGIT, status)), 118 nameStartChars(unisets::get(unisets::NAME_START, status)), 119 nameChars(unisets::get(unisets::NAME_CHAR, status)), 120 textChars(unisets::get(unisets::TEXT, status)), 121 quotedChars(unisets::get(unisets::QUOTED, status)), 122 escapableChars(unisets::get(unisets::ESCAPABLE, status)), 123 source(input), index(0), errors(e), normalizedInput(normalizedInputRef), dataModel(dataModelBuilder) { 124 (void) status; 125 parseError.line = 0; 126 parseError.offset = 0; 127 parseError.lengthBeforeCurrentLine = 0; 128 parseError.preContext[0] = '\0'; 129 parseError.postContext[0] = '\0'; 130 } 131 132 bool isContentChar(UChar32) const; 133 bool isBidiControl(UChar32) const; 134 bool isWhitespace(UChar32) const; 135 bool isTextChar(UChar32) const; 136 bool isQuotedChar(UChar32) const; 137 bool isEscapableChar(UChar32) const; 138 bool isAlpha(UChar32) const; 139 bool isDigit(UChar32) const; 140 bool isNameStart(UChar32) const; 141 bool isNameChar(UChar32) const; 142 bool isUnquotedStart(UChar32) const; 143 bool isLiteralStart(UChar32) const; 144 bool isKeyStart(UChar32) const; 145 146 static void translateParseError(const MessageParseError&, UParseError&); 147 static void setParseError(MessageParseError&, uint32_t); 148 void maybeAdvanceLine(); 149 Pattern parseSimpleMessage(UErrorCode&); 150 void parseBody(UErrorCode&); 151 void parseDeclarations(UErrorCode&); 152 void parseUnsupportedStatement(UErrorCode&); 153 void parseLocalDeclaration(UErrorCode&); 154 void parseInputDeclaration(UErrorCode&); 155 void parseSelectors(UErrorCode&); 156 void parseVariant(UErrorCode&); 157 158 void parseRequiredWS(UErrorCode&); 159 void parseRequiredWhitespace(UErrorCode&); 160 void parseOptionalBidi(); 161 void parseOptionalWhitespace(); 162 void parseToken(UChar32, UErrorCode&); 163 void parseTokenWithWhitespace(UChar32, UErrorCode&); 164 void parseToken(const std::u16string_view&, UErrorCode&); 165 void parseTokenWithWhitespace(const std::u16string_view&, UErrorCode&); 166 bool nextIs(const std::u16string_view&) const; 167 UnicodeString parseNameChars(UnicodeString&, UErrorCode&); 168 UnicodeString parseName(UErrorCode&); 169 UnicodeString parseIdentifier(UErrorCode&); 170 UnicodeString parseDigits(UErrorCode&); 171 VariableName parseVariableName(UErrorCode&); 172 FunctionName parseFunction(UErrorCode&); 173 UnicodeString parseEscapeSequence(UErrorCode&); 174 Literal parseUnquotedLiteral(UErrorCode&); 175 Literal parseQuotedLiteral(UErrorCode&); 176 Literal parseLiteral(UErrorCode&); 177 template<class T> 178 void parseAttribute(AttributeAdder<T>&, UErrorCode&); 179 template<class T> 180 void parseAttributes(AttributeAdder<T>&, UErrorCode&); 181 template<class T> 182 void parseOption(OptionAdder<T>&, UErrorCode&); 183 template<class T> 184 void parseOptions(OptionAdder<T>&, UErrorCode&); 185 Operator parseAnnotation(UErrorCode&); 186 void parseLiteralOrVariableWithAnnotation(bool, Expression::Builder&, UErrorCode&); 187 Markup parseMarkup(UErrorCode&); 188 Expression parseExpression(UErrorCode&); 189 std::variant<Expression, Markup> parsePlaceholder(UErrorCode&); 190 UnicodeString parseTextChar(UErrorCode&); 191 Key parseKey(UErrorCode&); 192 SelectorKeys parseNonEmptyKeys(UErrorCode&); 193 void errorPattern(UErrorCode& status); 194 Pattern parseQuotedPattern(UErrorCode&); 195 bool isDeclarationStart(); 196 197 UChar32 peek() const { return source.char32At(index) ; } 198 UChar32 peek(uint32_t i) const { 199 return source.char32At(source.moveIndex32(index, i)); 200 } 201 void next() { index = source.moveIndex32(index, 1); } 202 203 bool inBounds() const { return (int32_t) index < source.length(); } 204 bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < source.length(); } 205 bool allConsumed() const { return (int32_t) index == source.length(); } 206 207 // UnicodeSets for checking character ranges 208 const UnicodeSet* contentChars; 209 const UnicodeSet* whitespaceChars; 210 const UnicodeSet* bidiControlChars; 211 const UnicodeSet* alphaChars; 212 const UnicodeSet* digitChars; 213 const UnicodeSet* nameStartChars; 214 const UnicodeSet* nameChars; 215 const UnicodeSet* textChars; 216 const UnicodeSet* quotedChars; 217 const UnicodeSet* escapableChars; 218 219 // The input string 220 const UnicodeString &source; 221 // The current position within the input string -- counting in UChar32 222 uint32_t index; 223 // Represents the current line (and when an error is indicated), 224 // character offset within the line of the parse error 225 MessageParseError parseError; 226 227 // The structure to use for recording errors 228 StaticErrors& errors; 229 230 // Normalized version of the input string (optional whitespace removed) 231 UnicodeString& normalizedInput; 232 233 // The parent builder 234 MFDataModel::Builder &dataModel; 235 236 }; // class Parser 237 } // namespace message2 238 239 U_NAMESPACE_END 240 241 #endif /* #if !UCONFIG_NO_MF2 */ 242 243 #endif /* #if !UCONFIG_NO_FORMATTING */ 244 245 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 246 247 #endif /* U_SHOW_CPLUSPLUS_API */ 248 249 #endif // MESSAGEFORMAT_PARSER_H 250 251 #endif // U_HIDE_DEPRECATED_API 252 // eof