read.c (13517B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1998-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * 11 * File read.c 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 05/26/99 stephen Creation. 17 * 5/10/01 Ram removed ustdio dependency 18 ******************************************************************************* 19 */ 20 21 #include <stdbool.h> 22 23 #include "read.h" 24 #include "errmsg.h" 25 #include "toolutil.h" 26 #include "unicode/ustring.h" 27 #include "unicode/utf16.h" 28 29 #define OPENBRACE 0x007B 30 #define CLOSEBRACE 0x007D 31 #define COMMA 0x002C 32 #define QUOTE 0x0022 33 #define ESCAPE 0x005C 34 #define SLASH 0x002F 35 #define ASTERISK 0x002A 36 #define SPACE 0x0020 37 #define COLON 0x003A 38 #define BADBOM 0xFFFE 39 #define CR 0x000D 40 #define LF 0x000A 41 42 static int32_t lineCount; 43 44 /* Protos */ 45 static enum ETokenType getStringToken(UCHARBUF *buf, 46 UChar32 initialChar, 47 struct UString *token, 48 UErrorCode *status); 49 50 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); 51 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); 52 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); 53 static UBool isWhitespace (UChar32 c); 54 static UBool isNewline (UChar32 c); 55 56 U_CFUNC void resetLineNumber(void) { 57 lineCount = 1; 58 } 59 60 /* Read and return the next token from the stream. If the token is of 61 type eString, fill in the token parameter with the token. If the 62 token is eError, then the status parameter will contain the 63 specific error. This will be eItemNotFound at the end of file, 64 indicating that all tokens have been returned. This method will 65 never return eString twice in a row; instead, multiple adjacent 66 string tokens will be merged into one, with no intervening 67 space. */ 68 U_CFUNC enum ETokenType 69 getNextToken(UCHARBUF* buf, 70 struct UString *token, 71 uint32_t *linenumber, /* out: linenumber of token */ 72 struct UString *comment, 73 UErrorCode *status) { 74 enum ETokenType result; 75 UChar32 c; 76 77 if (U_FAILURE(*status)) { 78 return TOK_ERROR; 79 } 80 81 /* Skip whitespace */ 82 c = getNextChar(buf, true, comment, status); 83 84 if (U_FAILURE(*status)) { 85 return TOK_ERROR; 86 } 87 88 *linenumber = lineCount; 89 90 switch(c) { 91 case BADBOM: 92 return TOK_ERROR; 93 case OPENBRACE: 94 return TOK_OPEN_BRACE; 95 case CLOSEBRACE: 96 return TOK_CLOSE_BRACE; 97 case COMMA: 98 return TOK_COMMA; 99 case U_EOF: 100 return TOK_EOF; 101 case COLON: 102 return TOK_COLON; 103 104 default: 105 result = getStringToken(buf, c, token, status); 106 } 107 108 *linenumber = lineCount; 109 return result; 110 } 111 112 /* Copy a string token into the given UnicodeString. Upon entry, we 113 have already read the first character of the string token, which is 114 not a whitespace character (but may be a QUOTE or ESCAPE). This 115 function reads all subsequent characters that belong with this 116 string, and copy them into the token parameter. The other 117 important, and slightly convoluted purpose of this function is to 118 merge adjacent strings. It looks forward a bit, and if the next 119 non comment, non whitespace item is a string, it reads it in as 120 well. If two adjacent strings are quoted, they are merged without 121 intervening space. Otherwise a single SPACE character is 122 inserted. */ 123 static enum ETokenType getStringToken(UCHARBUF* buf, 124 UChar32 initialChar, 125 struct UString *token, 126 UErrorCode *status) { 127 UBool lastStringWasQuoted; 128 UChar32 c; 129 UChar target[3] = { '\0' }; 130 UChar *pTarget = target; 131 int len=0; 132 UBool isFollowingCharEscaped=false; 133 UBool isNLUnescaped = false; 134 UChar32 prevC=0; 135 136 /* We are guaranteed on entry that initialChar is not a whitespace 137 character. If we are at the EOF, or have some other problem, it 138 doesn't matter; we still want to validly return the initialChar 139 (if nothing else) as a string token. */ 140 141 if (U_FAILURE(*status)) { 142 return TOK_ERROR; 143 } 144 145 /* setup */ 146 lastStringWasQuoted = false; 147 c = initialChar; 148 ustr_setlen(token, 0, status); 149 150 if (U_FAILURE(*status)) { 151 return TOK_ERROR; 152 } 153 154 for (;;) { 155 if (c == QUOTE) { 156 if (!lastStringWasQuoted && token->fLength > 0) { 157 ustr_ucat(token, SPACE, status); 158 159 if (U_FAILURE(*status)) { 160 return TOK_ERROR; 161 } 162 } 163 164 lastStringWasQuoted = true; 165 166 for (;;) { 167 c = ucbuf_getc(buf,status); 168 169 /* EOF reached */ 170 if (c == U_EOF) { 171 return TOK_EOF; 172 } 173 174 /* Unterminated quoted strings */ 175 if (U_FAILURE(*status)) { 176 return TOK_ERROR; 177 } 178 179 if (c == QUOTE && !isFollowingCharEscaped) { 180 break; 181 } 182 183 if (c == ESCAPE && !isFollowingCharEscaped) { 184 pTarget = target; 185 c = unescape(buf, status); 186 187 if (c == U_ERR) { 188 return TOK_ERROR; 189 } 190 if(c == CR || c == LF){ 191 isNLUnescaped = true; 192 } 193 } 194 195 if(c==ESCAPE && !isFollowingCharEscaped){ 196 isFollowingCharEscaped = true; 197 }else{ 198 U_APPEND_CHAR32(c, pTarget,len); 199 pTarget = target; 200 ustr_uscat(token, pTarget,len, status); 201 isFollowingCharEscaped = false; 202 len=0; 203 if(c == CR || c == LF){ 204 if(isNLUnescaped == false && prevC!=CR){ 205 lineCount++; 206 } 207 isNLUnescaped = false; 208 } 209 } 210 211 if (U_FAILURE(*status)) { 212 return TOK_ERROR; 213 } 214 prevC = c; 215 } 216 } else { 217 if (token->fLength > 0) { 218 ustr_ucat(token, SPACE, status); 219 220 if (U_FAILURE(*status)) { 221 return TOK_ERROR; 222 } 223 } 224 225 if(lastStringWasQuoted){ 226 if(getShowWarning()){ 227 warning(lineCount, "Mixing quoted and unquoted strings"); 228 } 229 if(isStrict()){ 230 return TOK_ERROR; 231 } 232 233 } 234 235 lastStringWasQuoted = false; 236 237 /* if we reach here we are mixing 238 * quoted and unquoted strings 239 * warn in normal mode and error in 240 * pedantic mode 241 */ 242 243 if (c == ESCAPE) { 244 pTarget = target; 245 c = unescape(buf, status); 246 247 /* EOF reached */ 248 if (c == U_EOF) { 249 return TOK_ERROR; 250 } 251 } 252 253 U_APPEND_CHAR32(c, pTarget,len); 254 pTarget = target; 255 ustr_uscat(token, pTarget,len, status); 256 len=0; 257 258 if (U_FAILURE(*status)) { 259 return TOK_ERROR; 260 } 261 262 for (;;) { 263 /* DON'T skip whitespace */ 264 c = getNextChar(buf, false, NULL, status); 265 266 /* EOF reached */ 267 if (c == U_EOF) { 268 ucbuf_ungetc(c, buf); 269 return TOK_STRING; 270 } 271 272 if (U_FAILURE(*status)) { 273 return TOK_STRING; 274 } 275 276 if (c == QUOTE 277 || c == OPENBRACE 278 || c == CLOSEBRACE 279 || c == COMMA 280 || c == COLON) { 281 ucbuf_ungetc(c, buf); 282 break; 283 } 284 285 if (isWhitespace(c)) { 286 break; 287 } 288 289 if (c == ESCAPE) { 290 pTarget = target; 291 c = unescape(buf, status); 292 293 if (c == U_ERR) { 294 return TOK_ERROR; 295 } 296 } 297 298 U_APPEND_CHAR32(c, pTarget,len); 299 pTarget = target; 300 ustr_uscat(token, pTarget,len, status); 301 len=0; 302 if (U_FAILURE(*status)) { 303 return TOK_ERROR; 304 } 305 } 306 } 307 308 /* DO skip whitespace */ 309 c = getNextChar(buf, true, NULL, status); 310 311 if (U_FAILURE(*status)) { 312 return TOK_STRING; 313 } 314 315 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { 316 ucbuf_ungetc(c, buf); 317 return TOK_STRING; 318 } 319 } 320 } 321 322 /* Retrieve the next character. If skipwhite is 323 true, whitespace is skipped as well. */ 324 static UChar32 getNextChar(UCHARBUF* buf, 325 UBool skipwhite, 326 struct UString *token, 327 UErrorCode *status) { 328 UChar32 c, c2; 329 330 if (U_FAILURE(*status)) { 331 return U_EOF; 332 } 333 334 for (;;) { 335 c = ucbuf_getc(buf,status); 336 337 if (c == U_EOF) { 338 return U_EOF; 339 } 340 341 if (skipwhite && isWhitespace(c)) { 342 continue; 343 } 344 345 /* This also handles the get() failing case */ 346 if (c != SLASH) { 347 return c; 348 } 349 350 c = ucbuf_getc(buf,status); /* "/c" */ 351 352 if (c == U_EOF) { 353 return U_EOF; 354 } 355 356 switch (c) { 357 case SLASH: /* "//" */ 358 seekUntilNewline(buf, NULL, status); 359 break; 360 361 case ASTERISK: /* " / * " */ 362 c2 = ucbuf_getc(buf, status); /* "/ * c" */ 363 if(c2 == ASTERISK){ /* "/ * *" */ 364 /* parse multi-line comment and store it in token*/ 365 seekUntilEndOfComment(buf, token, status); 366 } else { 367 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ 368 seekUntilEndOfComment(buf, NULL, status); 369 } 370 break; 371 372 default: 373 ucbuf_ungetc(c, buf); /* "/c" - put back the c */ 374 /* If get() failed this is a NOP */ 375 return SLASH; 376 } 377 378 } 379 } 380 381 static void seekUntilNewline(UCHARBUF* buf, 382 struct UString *token, 383 UErrorCode *status) { 384 UChar32 c; 385 386 if (U_FAILURE(*status)) { 387 return; 388 } 389 390 do { 391 c = ucbuf_getc(buf,status); 392 /* add the char to token */ 393 if(token!=NULL){ 394 ustr_u32cat(token, c, status); 395 } 396 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); 397 } 398 399 static void seekUntilEndOfComment(UCHARBUF *buf, 400 struct UString *token, 401 UErrorCode *status) { 402 UChar32 c, d; 403 uint32_t line; 404 405 if (U_FAILURE(*status)) { 406 return; 407 } 408 409 line = lineCount; 410 411 do { 412 c = ucbuf_getc(buf, status); 413 414 if (c == ASTERISK) { 415 d = ucbuf_getc(buf, status); 416 417 if (d != SLASH) { 418 ucbuf_ungetc(d, buf); 419 } else { 420 break; 421 } 422 } 423 /* add the char to token */ 424 if(token!=NULL){ 425 ustr_u32cat(token, c, status); 426 } 427 /* increment the lineCount */ 428 isNewline(c); 429 430 } while (c != U_EOF && *status == U_ZERO_ERROR); 431 432 if (c == U_EOF) { 433 *status = U_INVALID_FORMAT_ERROR; 434 error(line, "unterminated comment detected"); 435 } 436 } 437 438 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { 439 if (U_FAILURE(*status)) { 440 return U_EOF; 441 } 442 443 /* We expect to be called after the ESCAPE has been seen, but 444 * u_fgetcx needs an ESCAPE to do its magic. */ 445 ucbuf_ungetc(ESCAPE, buf); 446 447 return ucbuf_getcx32(buf, status); 448 } 449 450 static UBool isWhitespace(UChar32 c) { 451 switch (c) { 452 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ 453 case 0x000A: 454 case 0x2029: 455 lineCount++; 456 U_FALLTHROUGH; 457 case 0x000D: 458 case 0x0020: 459 case 0x0009: 460 case 0xFEFF: 461 return true; 462 463 default: 464 return false; 465 } 466 } 467 468 static UBool isNewline(UChar32 c) { 469 switch (c) { 470 /* '\n', '\r', 0x2029 */ 471 case 0x000A: 472 case 0x2029: 473 lineCount++; 474 U_FALLTHROUGH; 475 case 0x000D: 476 return true; 477 478 default: 479 return false; 480 } 481 }