lib.rs (41207B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//! This crate implements a prefs file parser.
//!
//! Pref files have the following grammar. Note that there are slight
//! differences between the grammar for a default prefs file and a user prefs
//! file.
//!
//! ```text
//! <pref-file>   = <pref>*
//! <pref>        = <pref-spec> "(" <pref-name> "," <pref-value> <pref-attrs> ")" ";"
//! <pref-spec>   = "user_pref" | "pref" | "sticky_pref" // in default pref files
//! <pref-spec>   = "user_pref"                          // in user pref files
//! <pref-name>   = <string-literal>
//! <pref-value>  = <string-literal> | "true" | "false" | <int-value>
//! <int-value>   = <sign>? <int-literal>
//! <sign>        = "+" | "-"
//! <int-literal> = [0-9]+ (and cannot be followed by [A-Za-z_])
//! <string-literal> =
//!   A single or double-quoted string, with the following escape sequences
//!   allowed: \", \', \\, \n, \r, \xNN, \uNNNN, where \xNN gives a raw byte
//!   value that is copied directly into an 8-bit string value, and \uNNNN
//!   gives a UTF-16 code unit that is converted to UTF-8 before being copied
//!   into an 8-bit string value. \x00 and \u0000 are disallowed because they
//!   would cause C++ code handling such strings to misbehave.
//! <pref-attrs>  = ("," <pref-attr>)*      // in default pref files
//!               = <empty>                 // in user pref files
//! <pref-attr>   = "sticky" | "locked"     // default pref files only
//! ```
//!
//! Comments can take three forms:
//! - `# Python-style comments`
//! - `// C++ style comments`
//! - `/* C style comments (non-nested) */`
//!
//! Non-end-of-line whitespace chars are `\t`, `\v`, `\f`, and space.
//!
//! End-of-line sequences can take three forms, each of which is considered as
//! a single EOL:
//! - `\n`
//! - `\r` (without subsequent `\n`)
//! - `\r\n`
//!
//! The valid range for `<int-value>` is -2,147,483,648..2,147,483,647. Values
//! outside that range will result in a parse error.
//!
//! A `\0` char is interpreted as the end of the file, so everything after it
//! is ignored. The use of this character in a prefs file is not recommended.
//! It cannot be smuggled into a string literal either: the `\x00` and `\u0000`
//! escape sequences are rejected with a parse error (see `<string-literal>`
//! above).
//!
//! The parser performs error recovery. On a syntax error, it will scan forward
//! to the next `;` token and then continue parsing. If the syntax error occurs
//! in the middle of a token, it will first finish obtaining the current token
//! in an appropriate fashion.

// This parser uses several important optimizations.
//
// - Because "`\0` means EOF" is part of the grammar (see above), EOF is
//   representable by a u8. If EOF was represented by an out-of-band value such
//   as -1 or 256, we'd have to return a larger type such as `u16` or `i16`
//   from `get_char()`.
//
// - When starting a new token, it uses a lookup table with the first char,
//   which quickly identifies what kind of token it will be. Furthermore, if
//   that token is an unambiguous single-char token (e.g. `(`, `)`, `+`, `,`,
//   `-`, `;`), the parser will return the appropriate token kind value at
//   minimal cost because the single-char tokens have a uniform representation.
//
// - It has a lookup table that identifies chars in string literals that need
//   special handling. This means non-special chars (the common case) can be
//   handled with a single test, rather than testing for the multiple special
//   cases.
//
// - It pre-scans string literals for special chars. If none are present, it
//   bulk copies the string literal into a Vec, which is faster than doing a
//   char-by-char copy.
//
// - It reuses Vecs to avoid creating a new one for each string literal.
use std::os::raw::{c_char, c_uchar};

//---------------------------------------------------------------------------
// The public interface
//---------------------------------------------------------------------------

/// The type of a pref's value, as reported to the pref callback.
///
/// Keep this in sync with PrefType in Preferences.cpp.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefType {
    None,
    String,
    Int,
    Bool,
}

/// Whether a pref definition sets the default value or the user value.
///
/// Keep this in sync with PrefValueKind in Preferences.h.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefValueKind {
    Default,
    User,
}

/// The value of a pref. Which field is valid is indicated by the `PrefType`
/// passed alongside it. For strings, `string_val` points at a null-terminated
/// buffer owned by the parser, which is reused for later prefs.
///
/// Keep this in sync with PrefValue in Preferences.cpp.
#[repr(C)]
pub union PrefValue {
    pub string_val: *const c_char,
    pub int_val: i32,
    pub bool_val: bool,
}

/// Keep this in sync with PrefsParserPrefFn in Preferences.cpp.
type PrefFn = unsafe extern "C" fn(
    pref_name: *const c_char,
    pref_type: PrefType,
    pref_value_kind: PrefValueKind,
    pref_value: PrefValue,
    is_sticky: bool,
    is_locked: bool,
);

/// Keep this in sync with PrefsParserErrorFn in Preferences.cpp.
type ErrorFn = unsafe extern "C" fn(msg: *const c_char);

/// Parse the contents of a prefs file.
///
/// `path` is the null-terminated path of the file; it is only used in error
/// messages.
///
/// `kind` says whether this is a default prefs file or a user prefs file.
///
/// `buf` is a null-terminated string. `len` is its length, excluding the
/// null terminator.
///
/// `pref_fn` is called once for each successfully parsed pref.
///
/// `error_fn` is called once for each parse error detected.
///
/// Returns `true` if no parse errors were detected.
///
/// Keep this in sync with the prefs_parser_parse() declaration in
/// Preferences.cpp.
///
/// # Safety
///
/// - `path` must point to a valid null-terminated string.
/// - `buf` must point to at least `len + 1` readable bytes.
/// - `pref_fn` and `error_fn` must be safe to call with the pointers the
///   parser hands them; those pointers are only valid for the duration of the
///   callback.
///
/// # Panics
///
/// Panics if `buf[len]` is not `\0`.
#[no_mangle]
pub unsafe extern "C" fn prefs_parser_parse(
    path: *const c_char,
    kind: PrefValueKind,
    buf: *const c_char,
    len: usize,
    pref_fn: PrefFn,
    error_fn: ErrorFn,
) -> bool {
    let path = std::ffi::CStr::from_ptr(path)
        .to_string_lossy()
        .into_owned();

    // Make sure `buf` ends in a '\0', and include that in the length, because
    // it represents EOF.
    let buf = std::slice::from_raw_parts(buf as *const c_uchar, len + 1);
    assert!(buf.last() == Some(&EOF));

    let mut parser = Parser::new(&path, kind, buf, pref_fn, error_fn);
    parser.parse()
}

//---------------------------------------------------------------------------
// The implementation
//---------------------------------------------------------------------------

#[derive(Clone, Copy, Debug, PartialEq)]
enum Token {
    // Unambiguous single-char tokens.
    SingleChar(u8),

    // Keywords
    Pref,       // pref
    StickyPref, // sticky_pref
    UserPref,   // user_pref
    True,       // true
    False,      // false
    Sticky,     // sticky
    Locked,     // locked

    // String literal, e.g. '"string"'. The value is stored elsewhere.
    String,

    // Unsigned integer literal, e.g. '123'. Although libpref uses i32 values,
    // any '-' and '+' before an integer literal are treated as separate
    // tokens, so these token values are always non-negative. Furthermore, we
    // tokenize int literals as u32 so that 2147483648 (which doesn't fit into
    // an i32) can be subsequently negated to -2147483648 (which does fit into
    // an i32) if a '-' token precedes it.
    Int(u32),

    // Malformed token.
    Error(&'static str),

    // Malformed token at a particular line number. For use when
    // Parser::line_num might not be the right line number when the error is
    // reported. E.g. if a multi-line string has a bad escape sequence on the
    // first line, we don't report the error until the string's end has been
    // reached.
    ErrorAtLine(&'static str, u32),
}

// We categorize every char by what action should be taken when it appears at
// the start of a new token.
#[derive(Clone, Copy, PartialEq)]
enum CharKind {
    // These are ordered by frequency. See the comment in get_token().
    SingleChar, // Unambiguous single-char tokens: [()+,-;] or EOF
    SpaceNL,    // [\t\v\f \n]
    Keyword,    // [A-Za-z_]
    Quote,      // ["']
    Slash,      // /
    Digit,      // [0-9]
    Hash,       // #
    CR,         // \r
    Other,      // Everything else; invalid except within strings and comments.
}

const C_SINGL: CharKind = CharKind::SingleChar;
const C_SPCNL: CharKind = CharKind::SpaceNL;
const C_KEYWD: CharKind = CharKind::Keyword;
const C_QUOTE: CharKind = CharKind::Quote;
const C_SLASH: CharKind = CharKind::Slash;
const C_DIGIT: CharKind = CharKind::Digit;
const C_HASH_: CharKind = CharKind::Hash;
const C_CR___: CharKind = CharKind::CR;
const C______: CharKind = CharKind::Other;

// Indexed by byte value; gives the CharKind of a byte that starts a token.
#[rustfmt::skip]
const CHAR_KINDS: [CharKind; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ C_SINGL, C______, C______, C______, C______, C______, C______, C______, C______, C_SPCNL,
/*  10+ */ C_SPCNL, C_SPCNL, C_SPCNL, C_CR___, C______, C______, C______, C______, C______, C______,
/*  20+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/*  30+ */ C______, C______, C_SPCNL, C______, C_QUOTE, C_HASH_, C______, C______, C______, C_QUOTE,
/*  40+ */ C_SINGL, C_SINGL, C______, C_SINGL, C_SINGL, C_SINGL, C______, C_SLASH, C_DIGIT, C_DIGIT,
/*  50+ */ C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C______, C_SINGL,
/*  60+ */ C______, C______, C______, C______, C______, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  70+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  80+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  90+ */ C_KEYWD, C______, C______, C______, C______, C_KEYWD, C______, C_KEYWD, C_KEYWD, C_KEYWD,
/* 100+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 110+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 120+ */ C_KEYWD, C_KEYWD, C_KEYWD, C______, C______, C______, C______, C______, C______, C______,
/* 130+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 140+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 150+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 160+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 170+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 180+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 190+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 200+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 210+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 220+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 230+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 240+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 250+ */ C______, C______, C______, C______, C______, C______
];

const _______: bool = false;

// Indexed by byte value; `true` for the bytes that need special handling
// inside a string literal: '\0' (EOF), '\n', '\r', '"', '\'' and '\\'.
#[rustfmt::skip]
const SPECIAL_STRING_CHARS: [bool; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */    true, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  10+ */    true, _______, _______,    true, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______, _______, _______,    true, _______, _______, _______, _______,    true,
/*  40+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  50+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  60+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  70+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  80+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  90+ */ _______, _______,    true, _______, _______, _______, _______, _______, _______, _______,
/* 100+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 110+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 120+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 130+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 140+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 150+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 160+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 170+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 180+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 190+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 200+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 210+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 220+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 230+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 240+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 250+ */ _______, _______, _______, _______, _______, _______
];

// Pairs a keyword's spelling with the token it produces.
struct KeywordInfo {
    string: &'static [u8],
    token: Token,
}

const KEYWORD_INFOS: [KeywordInfo; 7] = [
    // These are ordered by frequency.
    KeywordInfo {
        string: b"pref",
        token: Token::Pref,
    },
    KeywordInfo {
        string: b"true",
        token: Token::True,
    },
    KeywordInfo {
        string: b"false",
        token: Token::False,
    },
    KeywordInfo {
        string: b"user_pref",
        token: Token::UserPref,
    },
    KeywordInfo {
        string: b"sticky",
        token: Token::Sticky,
    },
    KeywordInfo {
        string: b"locked",
        token: Token::Locked,
    },
    KeywordInfo {
        string: b"sticky_pref",
        token: Token::StickyPref,
    },
];

struct Parser<'t> {
    path: &'t str,       // Path to the file being parsed. Used in error messages.
    kind: PrefValueKind, // Default prefs file or user prefs file?
    buf: &'t [u8],       // Text being parsed.
    i: usize,            // Index of next char to be read.
    line_num: u32,       // Current line number within the text.
    pref_fn: PrefFn,     // Callback for processing each pref.
    error_fn: ErrorFn,   // Callback for parse errors.
    has_errors: bool,    // Have we encountered errors?
}

// As described above, we use 0 to represent EOF.
const EOF: u8 = b'\0';

impl<'t> Parser<'t> {
    // Creates a parser positioned at the start of `buf`, on line 1. `buf` is
    // expected to end with an EOF char; the `get_char_unchecked()` call sites
    // rely on that.
    fn new(
        path: &'t str,
        kind: PrefValueKind,
        buf: &'t [u8],
        pref_fn: PrefFn,
        error_fn: ErrorFn,
    ) -> Parser<'t> {
        // Make sure these tables take up 1 byte per entry.
        assert!(std::mem::size_of_val(&CHAR_KINDS) == 256);
        assert!(std::mem::size_of_val(&SPECIAL_STRING_CHARS) == 256);

        Parser {
            path,
            kind,
            buf,
            i: 0,
            line_num: 1,
            pref_fn,
            error_fn,
            has_errors: false,
        }
    }

    // Parses the whole buffer, calling `pref_fn` for each well-formed pref and
    // `error_fn` for each error. Returns true if there were no errors.
    fn parse(&mut self) -> bool {
        // These are reused, because allocating a new Vec for every string is slow.
        let mut name_str = Vec::with_capacity(128); // For pref names.
        let mut value_str = Vec::with_capacity(512); // For string pref values.
        let mut none_str = Vec::with_capacity(0); // For tokens that shouldn't be strings.

        let mut token = self.get_token(&mut none_str);

        // At the top of the loop we already have a token. In a valid input
        // this will be either the first token of a new pref, or EOF.
        loop {
            // <pref-spec>
            let (pref_value_kind, mut is_sticky) = match token {
                Token::Pref if self.kind == PrefValueKind::Default => {
                    (PrefValueKind::Default, false)
                }
                Token::StickyPref if self.kind == PrefValueKind::Default => {
                    (PrefValueKind::Default, true)
                }
                Token::UserPref => (PrefValueKind::User, false),
                Token::SingleChar(EOF) => return !self.has_errors,
                _ => {
                    token = self.error_and_recover(
                        token,
                        if self.kind == PrefValueKind::Default {
                            "expected pref specifier at start of pref definition"
                        } else {
                            "expected 'user_pref' at start of pref definition"
                        },
                    );
                    continue;
                }
            };

            // "("
            token = self.get_token(&mut none_str);
            if token != Token::SingleChar(b'(') {
                token = self.error_and_recover(token, "expected '(' after pref specifier");
                continue;
            }

            // <pref-name>
            token = self.get_token(&mut name_str);
            let pref_name = if token == Token::String {
                &name_str
            } else {
                token = self.error_and_recover(token, "expected pref name after '('");
                continue;
            };

            // ","
            token = self.get_token(&mut none_str);
            if token != Token::SingleChar(b',') {
                token = self.error_and_recover(token, "expected ',' after pref name");
                continue;
            }

            // <pref-value>
            token = self.get_token(&mut value_str);
            let (pref_type, pref_value) = match token {
                Token::True => (PrefType::Bool, PrefValue { bool_val: true }),
                Token::False => (PrefType::Bool, PrefValue { bool_val: false }),
                Token::String => (
                    PrefType::String,
                    PrefValue {
                        string_val: value_str.as_ptr() as *const c_char,
                    },
                ),
                Token::Int(u) => {
                    // Accept u <= 2147483647; anything larger will overflow i32.
                    if u <= std::i32::MAX as u32 {
                        (PrefType::Int, PrefValue { int_val: u as i32 })
                    } else {
                        token =
                            self.error_and_recover(Token::Error("integer literal overflowed"), "");
                        continue;
                    }
                }
                Token::SingleChar(b'-') => {
                    token = self.get_token(&mut none_str);
                    if let Token::Int(u) = token {
                        // Accept u <= 2147483648; anything larger will overflow i32 once negated.
                        if u <= std::i32::MAX as u32 {
                            (
                                PrefType::Int,
                                PrefValue {
                                    int_val: -(u as i32),
                                },
                            )
                        } else if u == std::i32::MAX as u32 + 1 {
                            (
                                PrefType::Int,
                                PrefValue {
                                    int_val: std::i32::MIN,
                                },
                            )
                        } else {
                            token = self
                                .error_and_recover(Token::Error("integer literal overflowed"), "");
                            continue;
                        }
                    } else {
                        token = self.error_and_recover(token, "expected integer literal after '-'");
                        continue;
                    }
                }
                Token::SingleChar(b'+') => {
                    token = self.get_token(&mut none_str);
                    if let Token::Int(u) = token {
                        // Accept u <= 2147483647; anything larger will overflow i32.
                        if u <= std::i32::MAX as u32 {
                            (PrefType::Int, PrefValue { int_val: u as i32 })
                        } else {
                            token = self
                                .error_and_recover(Token::Error("integer literal overflowed"), "");
                            continue;
                        }
                    } else {
                        token = self.error_and_recover(token, "expected integer literal after '+'");
                        continue;
                    }
                }
                _ => {
                    token = self.error_and_recover(token, "expected pref value after ','");
                    continue;
                }
            };

            // ("," <pref-attr>)*   // default pref files only
            let mut is_locked = false;
            let mut has_attrs = false;
            if self.kind == PrefValueKind::Default {
                let ok = loop {
                    // ","
                    token = self.get_token(&mut none_str);
                    if token != Token::SingleChar(b',') {
                        break true;
                    }

                    // <pref-attr>
                    token = self.get_token(&mut none_str);
                    match token {
                        Token::Sticky => is_sticky = true,
                        Token::Locked => is_locked = true,
                        _ => {
                            token =
                                self.error_and_recover(token, "expected pref attribute after ','");
                            break false;
                        }
                    }
                    has_attrs = true;
                };
                if !ok {
                    continue;
                }
            } else {
                token = self.get_token(&mut none_str);
            }

            // ")"
            if token != Token::SingleChar(b')') {
                let expected_msg = if self.kind == PrefValueKind::Default {
                    if has_attrs {
                        "expected ',' or ')' after pref attribute"
                    } else {
                        "expected ',' or ')' after pref value"
                    }
                } else {
                    "expected ')' after pref value"
                };
                token = self.error_and_recover(token, expected_msg);
                continue;
            }

            // ";"
            token = self.get_token(&mut none_str);
            if token != Token::SingleChar(b';') {
                token = self.error_and_recover(token, "expected ';' after ')'");
                continue;
            }

            // SAFETY: `pref_name` and (for string prefs) `pref_value` point at
            // null-terminated buffers (`name_str`, `value_str`) that are alive
            // and unmodified for the duration of the call. The callback itself
            // is trusted per the contract of `prefs_parser_parse()`.
            unsafe {
                (self.pref_fn)(
                    pref_name.as_ptr() as *const c_char,
                    pref_type,
                    pref_value_kind,
                    pref_value,
                    is_sticky,
                    is_locked,
                )
            };

            token = self.get_token(&mut none_str);
        }
    }

    // Reports an error via `error_fn`, then skips tokens until just past the
    // next ';' (or until EOF). Returns the token at which parsing should
    // resume.
    fn error_and_recover(&mut self, token: Token, msg: &str) -> Token {
        self.has_errors = true;

        // If `token` is a Token::{Error,ErrorAtLine}, it's a lexing error and
        // the error message is within `token`. Otherwise, it's a parsing error
        // and the error message is in `msg`.
        let (msg, line_num) = match token {
            Token::Error(token_msg) => (token_msg, self.line_num),
            Token::ErrorAtLine(token_msg, line_num) => (token_msg, line_num),
            _ => (msg, self.line_num),
        };
        let msg = format!("{}:{}: prefs parse error: {}", self.path, line_num, msg);
        // `path` was obtained from a C string and the messages are literals,
        // so there is no interior NUL for CString::new() to reject.
        let msg = std::ffi::CString::new(msg).expect("error message contains no NUL bytes");
        // SAFETY: `msg` is a valid null-terminated string that outlives the
        // call. The callback itself is trusted per the contract of
        // `prefs_parser_parse()`.
        unsafe { (self.error_fn)(msg.as_ptr() as *const c_char) };

        // "Panic-mode" recovery: consume tokens until one of the following
        // occurs.
        // - We hit a semicolon, whereupon we return the following token.
        // - We hit EOF, whereupon we return EOF.
        //
        // For this to work, if the lexing functions hit EOF in an error case
        // they must unget it so we can safely reget it here.
        //
        // If the starting token (passed in above) is EOF we must not get
        // another token otherwise we will read past the end of `self.buf`.
        let mut dummy_str = Vec::with_capacity(128);
        let mut token = token;
        loop {
            match token {
                Token::SingleChar(b';') => return self.get_token(&mut dummy_str),
                Token::SingleChar(EOF) => return token,
                _ => {}
            }
            token = self.get_token(&mut dummy_str);
        }
    }

    // Returns the next char and advances, or returns EOF (without advancing)
    // if we are already past the end of the buffer.
    #[inline(always)]
    fn get_char(&mut self) -> u8 {
        // We do the bounds check ourselves so we can return EOF on failure.
        // (Although the buffer is guaranteed to end in an EOF char, we might
        // go one char past that, whereupon we must return EOF again.)
        if self.i < self.buf.len() {
            // SAFETY: `self.i` was bounds-checked on the line above.
            let c = unsafe { *self.buf.get_unchecked(self.i) };
            self.i += 1;
            c
        } else {
            debug_assert!(self.i == self.buf.len());
            EOF
        }
    }

    // This function skips the bounds check in optimized builds. Using it at
    // the hottest two call sites gives a ~15% parsing speed boost.
    //
    // The caller must guarantee that `self.i < self.buf.len()`.
    #[inline(always)]
    unsafe fn get_char_unchecked(&mut self) -> u8 {
        debug_assert!(self.i < self.buf.len());
        let c = *self.buf.get_unchecked(self.i);
        self.i += 1;
        c
    }

    // Steps back one char, so that the most recently read char is read again.
    #[inline(always)]
    fn unget_char(&mut self) {
        debug_assert!(self.i > 0);
        self.i -= 1;
    }

    // Consumes the next char if it equals `c`. Returns whether it did.
    #[inline(always)]
    fn match_char(&mut self, c: u8) -> bool {
        if self.buf[self.i] == c {
            self.i += 1;
            true
        } else {
            false
        }
    }

    // Consumes the remainder of a single-line comment, including its EOL
    // sequence (or the EOF char that ends it).
    #[inline(always)]
    fn match_single_line_comment(&mut self) {
        loop {
            // To reach here, the previous char must have been '/' (if this is
            // the first loop iteration) or non-special (if this is the second
            // or subsequent iteration), and assertions elsewhere ensure that
            // there must be at least one subsequent char after those chars
            // (the '\0' for EOF).
            let c = unsafe { self.get_char_unchecked() };

            // All the special chars have value <= b'\r'.
            if c > b'\r' {
                continue;
            }
            match c {
                b'\n' => {
                    self.line_num += 1;
                    break;
                }
                b'\r' => {
                    self.line_num += 1;
                    self.match_char(b'\n');
                    break;
                }
                EOF => {
                    break;
                }
                _ => continue,
            }
        }
    }

    // Consumes the remainder of a `/* ... */` comment, keeping `line_num` up
    // to date. Returns false if we hit EOF without closing the comment.
    fn match_multi_line_comment(&mut self) -> bool {
        loop {
            match self.get_char() {
                b'*' => {
                    if self.match_char(b'/') {
                        return true;
                    }
                }
                b'\n' => {
                    self.line_num += 1;
                }
                b'\r' => {
                    self.line_num += 1;
                    self.match_char(b'\n');
                }
                EOF => return false,
                _ => continue,
            }
        }
    }

    // Reads exactly `ndigits` (2 or 4) hex digits and returns their value. On
    // hitting a non-hex char, that char is ungot and None is returned.
    fn match_hex_digits(&mut self, ndigits: i32) -> Option<u16> {
        debug_assert!(ndigits == 2 || ndigits == 4);
        let mut value: u16 = 0;
        for _ in 0..ndigits {
            value <<= 4;
            match self.get_char() {
                c @ b'0'..=b'9' => value += (c - b'0') as u16,
                c @ b'A'..=b'F' => value += (c - b'A') as u16 + 10,
                c @ b'a'..=b'f' => value += (c - b'a') as u16 + 10,
                _ => {
                    self.unget_char();
                    return None;
                }
            }
        }
        Some(value)
    }

    // Looks up what kind of token a char starts.
    #[inline(always)]
    fn char_kind(c: u8) -> CharKind {
        // SAFETY: the table has 256 entries, so a u8 index cannot exceed its
        // bounds.
        unsafe { *CHAR_KINDS.get_unchecked(c as usize) }
    }

    // Looks up whether a char needs special handling within a string literal.
    #[inline(always)]
    fn is_special_string_char(c: u8) -> bool {
        // SAFETY: the table has 256 entries, so a u8 index cannot exceed its
        // bounds.
        unsafe { *SPECIAL_STRING_CHARS.get_unchecked(c as usize) }
    }

    // If the obtained Token has a value, it is put within the Token, unless
    // it's a string, in which case it's put in `str_buf`. This avoids
    // allocating a new Vec for every string, which is slow.
    fn get_token(&mut self, str_buf: &mut Vec<u8>) -> Token {
        loop {
            // Note: the following tests are ordered by frequency when parsing
            // greprefs.js:
            // - SingleChar      36.7%
            // - SpaceNL         27.7% (14.9% for spaces, 12.8% for NL)
            // - Keyword         13.4%
            // - Quote           11.4%
            // - Slash            8.1%
            // - Digit            2.7%
            // - Hash, CR, Other  0.0%

            let c = self.get_char();
            match Parser::char_kind(c) {
                CharKind::SingleChar => {
                    return Token::SingleChar(c);
                }
                CharKind::SpaceNL => {
                    // It's slightly faster to combine the handling of the
                    // space chars with NL than to handle them separately; we
                    // have an extra test for this case, but one fewer test for
                    // all the subsequent CharKinds.
                    if c == b'\n' {
                        self.line_num += 1;
                    }
                    continue;
                }
                CharKind::Keyword => {
                    let start = self.i - 1;
                    loop {
                        let c = self.get_char();
                        if Parser::char_kind(c) != CharKind::Keyword {
                            self.unget_char();
                            break;
                        }
                    }
                    for info in KEYWORD_INFOS.iter() {
                        if &self.buf[start..self.i] == info.string {
                            return info.token;
                        }
                    }
                    return Token::Error("unknown keyword");
                }
                CharKind::Quote => {
                    return self.get_string_token(c, str_buf);
                }
                CharKind::Slash => {
                    match self.get_char() {
                        b'/' => {
                            self.match_single_line_comment();
                        }
                        b'*' => {
                            if !self.match_multi_line_comment() {
                                return Token::Error("unterminated /* comment");
                            }
                        }
                        c => {
                            if c == b'\n' || c == b'\r' {
                                // Unget the newline char; the outer loop will
                                // reget it and adjust self.line_num
                                // appropriately.
                                self.unget_char();
                            }
                            return Token::Error("expected '/' or '*' after '/'");
                        }
                    }
                    continue;
                }
                CharKind::Digit => {
                    // `value` becomes None once the literal overflows a u32;
                    // we keep consuming digits so the whole literal is eaten.
                    let mut value = Some((c - b'0') as u32);
                    loop {
                        let c = self.get_char();
                        match Parser::char_kind(c) {
                            CharKind::Digit => {
                                fn add_digit(value: Option<u32>, c: u8) -> Option<u32> {
                                    value?.checked_mul(10)?.checked_add((c - b'0') as u32)
                                }
                                value = add_digit(value, c);
                            }
                            CharKind::Keyword => {
                                // Reject things like "123foo". Error recovery
                                // will retokenize from "foo" onward.
                                self.unget_char();
                                return Token::Error("unexpected character in integer literal");
                            }
                            _ => {
                                self.unget_char();
                                break;
                            }
                        }
                    }
                    return match value {
                        Some(v) => Token::Int(v),
                        None => Token::Error("integer literal overflowed"),
                    };
                }
                CharKind::Hash => {
                    self.match_single_line_comment();
                    continue;
                }
                CharKind::CR => {
                    self.match_char(b'\n');
                    self.line_num += 1;
                    continue;
                }
                // Error recovery will retokenize from the next character.
                _ => return Token::Error("unexpected character"),
            }
        }
    }

    // Turns `token` into an error token carrying `msg` and the current line
    // number, unless it already is one.
    fn string_error_token(&self, token: &mut Token, msg: &'static str) {
        // We only want to capture the first tokenization error within a string.
        if *token == Token::String {
            *token = Token::ErrorAtLine(msg, self.line_num);
        }
    }

    // Lexes the rest of a string literal whose opening quote (`quote_char`)
    // has just been read. On success the unescaped, null-terminated contents
    // are left in `str_buf` and Token::String is returned.
    //
    // Always inline this because it has a single call site.
    #[inline(always)]
    fn get_string_token(&mut self, quote_char: u8, str_buf: &mut Vec<u8>) -> Token {
        // First scan through the string to see if it contains any chars that
        // need special handling.
        let start = self.i;
        let has_special_chars = loop {
            // To reach here, the previous char must have been a quote
            // (quote_char), and assertions elsewhere ensure that there must be
            // at least one subsequent char (the '\0' for EOF).
            let c = unsafe { self.get_char_unchecked() };
            if Parser::is_special_string_char(c) {
                break c != quote_char;
            }
        };

        // Clear str_buf's contents without changing its capacity.
        str_buf.clear();

        // If there are no special chars (the common case), we can bulk copy it
        // to str_buf. This is a lot faster than the char-by-char loop below.
        if !has_special_chars {
            str_buf.extend(&self.buf[start..self.i - 1]);
            str_buf.push(b'\0');
            return Token::String;
        }

        // There were special chars. Re-scan the string, filling in str_buf one
        // char at a time.
        //
        // On error, we change `token` to an error token and then keep going to
        // the end of the string literal. `str_buf` won't be used in that case.
        self.i = start;
        let mut token = Token::String;

        loop {
            let c = self.get_char();
            let c2 = if !Parser::is_special_string_char(c) {
                c
            } else if c == quote_char {
                break;
            } else if c == b'\\' {
                match self.get_char() {
                    b'\"' => b'\"',
                    b'\'' => b'\'',
                    b'\\' => b'\\',
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b'x' => {
                        if let Some(value) = self.match_hex_digits(2) {
                            debug_assert!(value <= 0xff);
                            if value != 0 {
                                value as u8
                            } else {
                                self.string_error_token(&mut token, "\\x00 is not allowed");
                                continue;
                            }
                        } else {
                            self.string_error_token(&mut token, "malformed \\x escape sequence");
                            continue;
                        }
                    }
                    b'u' => {
                        if let Some(value) = self.match_hex_digits(4) {
                            // One code unit, or two for a surrogate pair. Kept
                            // on the stack to avoid allocating per escape.
                            let mut utf16 = [value, 0];
                            let mut utf16_len = 1;
                            if 0xd800 == (0xfc00 & value) {
                                // High surrogate value. Look for the low surrogate value.
                                if self.match_char(b'\\') && self.match_char(b'u') {
                                    if let Some(lo) = self.match_hex_digits(4) {
                                        if 0xdc00 == (0xfc00 & lo) {
                                            // Found a valid low surrogate.
                                            utf16[1] = lo;
                                            utf16_len = 2;
                                        } else {
                                            self.string_error_token(
                                                &mut token,
                                                "invalid low surrogate after high surrogate",
                                            );
                                            continue;
                                        }
                                    }
                                }
                                if utf16_len != 2 {
                                    self.string_error_token(
                                        &mut token,
                                        "expected low surrogate after high surrogate",
                                    );
                                    continue;
                                }
                            } else if 0xdc00 == (0xfc00 & value) {
                                // Unaccompanied low surrogate value.
                                self.string_error_token(
                                    &mut token,
                                    "expected high surrogate before low surrogate",
                                );
                                continue;
                            } else if value == 0 {
                                self.string_error_token(&mut token, "\\u0000 is not allowed");
                                continue;
                            }

                            // Insert the UTF-16 sequence as UTF-8.
                            let mut utf8 = [0u8; 4];
                            for decoded in
                                std::char::decode_utf16(utf16[..utf16_len].iter().cloned())
                            {
                                // Lone surrogates were rejected above, so the
                                // sequence is always well-formed here.
                                let ch = decoded.expect("surrogates were validated above");
                                str_buf.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
                            }
                        } else {
                            self.string_error_token(&mut token, "malformed \\u escape sequence");
                            continue;
                        }
                        continue; // We don't want to str_buf.push(c2) below.
                    }
                    c => {
                        if c == b'\n' || c == b'\r' {
                            // Unget the newline char; the outer loop will
                            // reget it and adjust self.line_num appropriately.
                            self.unget_char();
                        }
                        self.string_error_token(
                            &mut token,
                            "unexpected escape sequence character after '\\'",
                        );
                        continue;
                    }
                }
            } else if c == b'\n' {
                self.line_num += 1;
                c
            } else if c == b'\r' {
                self.line_num += 1;
                if self.match_char(b'\n') {
                    str_buf.push(b'\r');
                    b'\n'
                } else {
                    c
                }
            } else if c == EOF {
                self.string_error_token(&mut token, "unterminated string literal");
                break;
            } else {
                // This case is only hit for the non-closing quote char.
                debug_assert!((c == b'\'' || c == b'\"') && c != quote_char);
                c
            };
            str_buf.push(c2);
        }
        str_buf.push(b'\0');

        token
    }
}