[ tor-browser ].git.dasho

lib.rs (41207B)
      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 //! This crate implements a prefs file parser.
      6 //!
//! Pref files have the following grammar. Note that there are slight
//! differences between the grammar for a default prefs file and a user prefs
//! file.
     10 //!
     11 //! ```text
     12 //! <pref-file>   = <pref>*
     13 //! <pref>        = <pref-spec> "(" <pref-name> "," <pref-value> <pref-attrs> ")" ";"
     14 //! <pref-spec>   = "user_pref" | "pref" | "sticky_pref" // in default pref files
     15 //! <pref-spec>   = "user_pref"                          // in user pref files
     16 //! <pref-name>   = <string-literal>
     17 //! <pref-value>  = <string-literal> | "true" | "false" | <int-value>
     18 //! <int-value>   = <sign>? <int-literal>
     19 //! <sign>        = "+" | "-"
     20 //! <int-literal> = [0-9]+ (and cannot be followed by [A-Za-z_])
     21 //! <string-literal> =
     22 //!   A single or double-quoted string, with the following escape sequences
     23 //!   allowed: \", \', \\, \n, \r, \xNN, \uNNNN, where \xNN gives a raw byte
     24 //!   value that is copied directly into an 8-bit string value, and \uNNNN
     25 //!   gives a UTF-16 code unit that is converted to UTF-8 before being copied
     26 //!   into an 8-bit string value. \x00 and \u0000 are disallowed because they
     27 //!   would cause C++ code handling such strings to misbehave.
     28 //! <pref-attrs>  = ("," <pref-attr>)*      // in default pref files
     29 //!               = <empty>                 // in user pref files
     30 //! <pref-attr>   = "sticky" | "locked"     // default pref files only
     31 //! ```
     32 //!
     33 //! Comments can take three forms:
     34 //! - `# Python-style comments`
     35 //! - `// C++ style comments`
     36 //! - `/* C style comments (non-nested) */`
     37 //!
     38 //! Non-end-of-line whitespace chars are `\t`, `\v`, `\f`, and space.
     39 //!
     40 //! End-of-line sequences can take three forms, each of which is considered as
     41 //! a single EOL:
     42 //! - `\n`
     43 //! - `\r` (without subsequent `\n`)
     44 //! - `\r\n`
     45 //!
     46 //! The valid range for `<int-value>` is -2,147,483,648..2,147,483,647. Values
     47 //! outside that range will result in a parse error.
     48 //!
     49 //! A `\0` char is interpreted as the end of the file. The use of this character
     50 //! in a prefs file is not recommended. Within string literals `\x00` or
     51 //! `\u0000` can be used instead.
     52 //!
     53 //! The parser performs error recovery. On a syntax error, it will scan forward
     54 //! to the next `;` token and then continue parsing. If the syntax error occurs
     55 //! in the middle of a token, it will first finish obtaining the current token
     56 //! in an appropriate fashion.
     57 
     58 // This parser uses several important optimizations.
     59 //
     60 // - Because "`\0` means EOF" is part of the grammar (see above), EOF is
     61 //   representable by a u8. If EOF was represented by an out-of-band value such
     62 //   as -1 or 256, we'd have to return a larger type such as `u16` or `i16`
     63 //   from `get_char()`.
     64 //
     65 // - When starting a new token, it uses a lookup table with the first char,
     66 //   which quickly identifies what kind of token it will be. Furthermore, if
     67 //   that token is an unambiguous single-char token (e.g. `(`, `)`, `+`, `,`,
     68 //   `-`, `;`), the parser will return the appropriate token kind value at
     69 //   minimal cost because the single-char tokens have a uniform representation.
     70 //
     71 // - It has a lookup table that identifies chars in string literals that need
     72 //   special handling. This means non-special chars (the common case) can be
     73 //   handled with a single test, rather than testing for the multiple special
     74 //   cases.
     75 //
     76 // - It pre-scans string literals for special chars. If none are present, it
     77 //   bulk copies the string literal into a Vec, which is faster than doing a
     78 //   char-by-char copy.
     79 //
     80 // - It reuses Vecs to avoid creating a new one for each string literal.
     81 
     82 use std::os::raw::{c_char, c_uchar};
     83 
     84 //---------------------------------------------------------------------------
     85 // The public interface
     86 //---------------------------------------------------------------------------
     87 
/// The type tag passed to the `PrefFn` callback alongside a `PrefValue`; it
/// says which member of that union was written by the parser.
///
/// The enum is `#[repr(u8)]` because it crosses the FFI boundary.
///
/// Keep this in sync with PrefType in Preferences.cpp.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefType {
   // Not produced by the parsing code visible in this file.
   None,
   // The pref value was a string literal.
   String,
   // The pref value was an (optionally signed) integer literal.
   Int,
   // The pref value was `true` or `false`.
   Bool,
}
     97 
/// Distinguishes default pref values from user pref values.
///
/// It is used in two places: as the kind of file being parsed (a default
/// prefs file or a user prefs file), and as the kind reported to `PrefFn` for
/// each pref (`pref`/`sticky_pref` give `Default`, `user_pref` gives `User`).
///
/// Keep this in sync with PrefValueKind in Preferences.h.
#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(u8)]
pub enum PrefValueKind {
   Default,
   User,
}
    105 
/// The value of a parsed pref, as handed to the `PrefFn` callback.
///
/// This is an untagged union: the accompanying `PrefType` says which member
/// is the one that was written.
///
/// Keep this in sync with PrefValue in Preferences.cpp.
#[repr(C)]
pub union PrefValue {
   // Written when the type is `PrefType::String`. Points into a buffer owned
   // by the parser.
   pub string_val: *const c_char,
   // Written when the type is `PrefType::Int`.
   pub int_val: i32,
   // Written when the type is `PrefType::Bool`.
   pub bool_val: bool,
}
    113 
/// Callback invoked once for each successfully parsed pref.
///
/// `pref_name` (and `pref_value.string_val` for string prefs) point into
/// buffers that the parser reuses for subsequent prefs, so presumably the
/// callee must copy anything it wants to keep -- TODO confirm against the C++
/// side.
///
/// Keep this in sync with PrefsParserPrefFn in Preferences.cpp.
type PrefFn = unsafe extern "C" fn(
   pref_name: *const c_char,
   pref_type: PrefType,
   pref_value_kind: PrefValueKind,
   pref_value: PrefValue,
   is_sticky: bool,
   is_locked: bool,
);
    123 
/// Callback invoked once for each parse error. `msg` is a NUL-terminated
/// string of the form `<path>:<line>: prefs parse error: <description>`; it
/// is owned by the parser and is freed after the callback returns.
///
/// Keep this in sync with PrefsParserErrorFn in Preferences.cpp.
type ErrorFn = unsafe extern "C" fn(msg: *const c_char);
    126 
    127 /// Parse the contents of a prefs file.
    128 ///
    129 /// `buf` is a null-terminated string. `len` is its length, excluding the
    130 /// null terminator.
    131 ///
    132 /// `pref_fn` is called once for each successfully parsed pref.
    133 ///
    134 /// `error_fn` is called once for each parse error detected.
    135 ///
    136 /// Keep this in sync with the prefs_parser_parse() declaration in
    137 /// Preferences.cpp.
    138 #[no_mangle]
    139 pub unsafe extern "C" fn prefs_parser_parse(
    140    path: *const c_char,
    141    kind: PrefValueKind,
    142    buf: *const c_char,
    143    len: usize,
    144    pref_fn: PrefFn,
    145    error_fn: ErrorFn,
    146 ) -> bool {
    147    let path = std::ffi::CStr::from_ptr(path)
    148        .to_string_lossy()
    149        .into_owned();
    150 
    151    // Make sure `buf` ends in a '\0', and include that in the length, because
    152    // it represents EOF.
    153    let buf = std::slice::from_raw_parts(buf as *const c_uchar, len + 1);
    154    assert!(buf.last() == Some(&EOF));
    155 
    156    let mut parser = Parser::new(&path, kind, &buf, pref_fn, error_fn);
    157    parser.parse()
    158 }
    159 
    160 //---------------------------------------------------------------------------
    161 // The implementation
    162 //---------------------------------------------------------------------------
    163 
// The tokens produced by the lexer (`Parser::get_token()`) and consumed by
// the parser (`Parser::parse()`). Lexing failures are reported in-band via the
// two error variants rather than through a separate channel.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Token {
   // Unambiguous single-char tokens. The payload is the char itself; EOF is
   // represented as `SingleChar(EOF)`.
   SingleChar(u8),

   // Keywords
   Pref,       // pref
   StickyPref, // sticky_pref
   UserPref,   // user_pref
   True,       // true
   False,      // false
   Sticky,     // sticky
   Locked,     // locked

   // String literal, e.g. '"string"'. The value is stored elsewhere.
   String,

   // Unsigned integer literal, e.g. '123'. Although libpref uses i32 values,
   // any '-' and '+' before an integer literal are treated as separate
   // tokens, so these token values are always positive. Furthermore, we
   // tokenize int literals as u32 so that 2147483648 (which doesn't fit into
   // an i32) can be subsequently negated to -2147483648 (which does fit into
   // an i32) if a '-' token precedes it.
   Int(u32),

   // Malformed token.
   Error(&'static str),

   // Malformed token at a particular line number. For use when
   // Parser::line_num might not be the right line number when the error is
   // reported. E.g. if a multi-line string has a bad escape sequence on the
   // first line, we don't report the error until the string's end has been
   // reached.
   ErrorAtLine(&'static str, u32),
}
    199 
// We categorize every char by what action should be taken when it appears at
// the start of a new token. The category of each byte value is given by the
// `CHAR_KINDS` lookup table.
#[derive(Clone, Copy, PartialEq)]
enum CharKind {
   // These are ordered by frequency. See the comment in get_token().
   SingleChar, // Unambiguous single-char tokens: [()+,-;] or EOF
   SpaceNL,    // [\t\v\f \n]
   Keyword,    // [A-Za-z_]
   Quote,      // ["']
   Slash,      // /
   Digit,      // [0-9]
   Hash,       // #
   CR,         // \r
   Other,      // Everything else; invalid except within strings and comments.
}
    215 
// Seven-character aliases for the `CharKind` variants. They all have the same
// width so that the rows of the `CHAR_KINDS` table line up in columns.
const C_SINGL: CharKind = CharKind::SingleChar;
const C_SPCNL: CharKind = CharKind::SpaceNL;
const C_KEYWD: CharKind = CharKind::Keyword;
const C_QUOTE: CharKind = CharKind::Quote;
const C_SLASH: CharKind = CharKind::Slash;
const C_DIGIT: CharKind = CharKind::Digit;
const C_HASH_: CharKind = CharKind::Hash;
const C_CR___: CharKind = CharKind::CR;
const C______: CharKind = CharKind::Other;
    225 
// Lookup table giving the `CharKind` of every possible byte value, indexed by
// the byte. Each row covers ten consecutive values; the comment at the start
// of a row gives the value of its first entry. Every byte >= 123 is `Other`.
#[rustfmt::skip]
const CHAR_KINDS: [CharKind; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ C_SINGL, C______, C______, C______, C______, C______, C______, C______, C______, C_SPCNL,
/*  10+ */ C_SPCNL, C_SPCNL, C_SPCNL, C_CR___, C______, C______, C______, C______, C______, C______,
/*  20+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/*  30+ */ C______, C______, C_SPCNL, C______, C_QUOTE, C_HASH_, C______, C______, C______, C_QUOTE,
/*  40+ */ C_SINGL, C_SINGL, C______, C_SINGL, C_SINGL, C_SINGL, C______, C_SLASH, C_DIGIT, C_DIGIT,
/*  50+ */ C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C______, C_SINGL,
/*  60+ */ C______, C______, C______, C______, C______, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  70+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  80+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/*  90+ */ C_KEYWD, C______, C______, C______, C______, C_KEYWD, C______, C_KEYWD, C_KEYWD, C_KEYWD,
/* 100+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 110+ */ C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD, C_KEYWD,
/* 120+ */ C_KEYWD, C_KEYWD, C_KEYWD, C______, C______, C______, C______, C______, C______, C______,
/* 130+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 140+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 150+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 160+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 170+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 180+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 190+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 200+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 210+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 220+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 230+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 240+ */ C______, C______, C______, C______, C______, C______, C______, C______, C______, C______,
/* 250+ */ C______, C______, C______, C______, C______, C______
];
    256 
// Seven-character alias for `false`, so that the few `true` entries stand out
// in the table below.
const _______: bool = false;
// Lookup table, indexed by byte value, marking the chars that need special
// handling when they occur inside a string literal. The `true` entries are
// 0 ('\0', i.e. EOF), 10 ('\n'), 13 ('\r'), 34 ('"'), 39 ('\'') and 92 ('\\').
#[rustfmt::skip]
const SPECIAL_STRING_CHARS: [bool; 256] = [
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */    true, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  10+ */    true, _______, _______,    true, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______, _______, _______,    true, _______, _______, _______, _______,    true,
/*  40+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  50+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  60+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  70+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  80+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  90+ */ _______, _______,    true, _______, _______, _______, _______, _______, _______, _______,
/* 100+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 110+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 120+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 130+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 140+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 150+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 160+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 170+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 180+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 190+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 200+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 210+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 220+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 230+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 240+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/* 250+ */ _______, _______, _______, _______, _______, _______
];
    288 
// Pairs the spelling of a keyword with the token that represents it. The
// entries live in `KEYWORD_INFOS`.
struct KeywordInfo {
   // The keyword's text, as raw bytes.
   string: &'static [u8],
   // The token produced for that keyword.
   token: Token,
}
    293 
    294 const KEYWORD_INFOS: [KeywordInfo; 7] = [
    295    // These are ordered by frequency.
    296    KeywordInfo {
    297        string: b"pref",
    298        token: Token::Pref,
    299    },
    300    KeywordInfo {
    301        string: b"true",
    302        token: Token::True,
    303    },
    304    KeywordInfo {
    305        string: b"false",
    306        token: Token::False,
    307    },
    308    KeywordInfo {
    309        string: b"user_pref",
    310        token: Token::UserPref,
    311    },
    312    KeywordInfo {
    313        string: b"sticky",
    314        token: Token::Sticky,
    315    },
    316    KeywordInfo {
    317        string: b"locked",
    318        token: Token::Locked,
    319    },
    320    KeywordInfo {
    321        string: b"sticky_pref",
    322        token: Token::StickyPref,
    323    },
    324 ];
    325 
// The state of one parse of one prefs file. The lifetime `'t` ties the parser
// to the path string and text buffer it borrows.
struct Parser<'t> {
   path: &'t str,       // Path to the file being parsed. Used in error messages.
   kind: PrefValueKind, // Default prefs file or user prefs file?
   buf: &'t [u8],       // Text being parsed.
   i: usize,            // Index of next char to be read.
   line_num: u32,       // Current line number within the text. Starts at 1.
   pref_fn: PrefFn,     // Callback for processing each pref.
   error_fn: ErrorFn,   // Callback for parse errors.
   has_errors: bool,    // Have we encountered errors?
}
    336 
// As described above, we use 0 to represent EOF. The text buffer handed to the
// parser always ends with this char.
const EOF: u8 = b'\0';
    339 
    340 impl<'t> Parser<'t> {
    341    fn new(
    342        path: &'t str,
    343        kind: PrefValueKind,
    344        buf: &'t [u8],
    345        pref_fn: PrefFn,
    346        error_fn: ErrorFn,
    347    ) -> Parser<'t> {
    348        // Make sure these tables take up 1 byte per entry.
    349        assert!(std::mem::size_of_val(&CHAR_KINDS) == 256);
    350        assert!(std::mem::size_of_val(&SPECIAL_STRING_CHARS) == 256);
    351 
    352        Parser {
    353            path: path,
    354            kind: kind,
    355            buf: buf,
    356            i: 0,
    357            line_num: 1,
    358            pref_fn: pref_fn,
    359            error_fn: error_fn,
    360            has_errors: false,
    361        }
    362    }
    363 
   // Parses the whole buffer, one pref definition per loop iteration.
   //
   // Each successfully parsed pref is passed to `self.pref_fn`. Each syntax
   // error is handed to `error_and_recover()`, which reports it and skips
   // ahead, after which parsing resumes with the token it returns.
   //
   // Returns true if EOF was reached without any error having been recorded,
   // and false otherwise.
   fn parse(&mut self) -> bool {
       // These are reused, because allocating a new Vec for every string is slow.
       let mut name_str = Vec::with_capacity(128); // For pref names.
       let mut value_str = Vec::with_capacity(512); // For string pref values.
       let mut none_str = Vec::with_capacity(0); // For tokens that shouldn't be strings.

       let mut token = self.get_token(&mut none_str);

       // At the top of the loop we already have a token. In a valid input
       // this will be either the first token of a new pref, or EOF.
       loop {
           // <pref-spec>
           // `pref` and `sticky_pref` are only accepted in default prefs
           // files; in a user prefs file they fall through to the error arm.
           let (pref_value_kind, mut is_sticky) = match token {
               Token::Pref if self.kind == PrefValueKind::Default => {
                   (PrefValueKind::Default, false)
               }
               Token::StickyPref if self.kind == PrefValueKind::Default => {
                   (PrefValueKind::Default, true)
               }
               Token::UserPref => (PrefValueKind::User, false),
               Token::SingleChar(EOF) => return !self.has_errors,
               _ => {
                   token = self.error_and_recover(
                       token,
                       if self.kind == PrefValueKind::Default {
                           "expected pref specifier at start of pref definition"
                       } else {
                           "expected 'user_pref' at start of pref definition"
                       },
                   );
                   continue;
               }
           };

           // "("
           token = self.get_token(&mut none_str);
           if token != Token::SingleChar(b'(') {
               token = self.error_and_recover(token, "expected '(' after pref specifier");
               continue;
           }

           // <pref-name>
           token = self.get_token(&mut name_str);
           let pref_name = if token == Token::String {
               &name_str
           } else {
               token = self.error_and_recover(token, "expected pref name after '('");
               continue;
           };

           // ","
           token = self.get_token(&mut none_str);
           if token != Token::SingleChar(b',') {
               token = self.error_and_recover(token, "expected ',' after pref name");
               continue;
           }

           // <pref-value>
           token = self.get_token(&mut value_str);
           let (pref_type, pref_value) = match token {
               Token::True => (PrefType::Bool, PrefValue { bool_val: true }),
               Token::False => (PrefType::Bool, PrefValue { bool_val: false }),
               // The pointer refers to `value_str`, which is overwritten by
               // the next string value that gets lexed.
               Token::String => (
                   PrefType::String,
                   PrefValue {
                       string_val: value_str.as_ptr() as *const c_char,
                   },
               ),
               Token::Int(u) => {
                   // Accept u <= 2147483647; anything larger will overflow i32.
                   if u <= std::i32::MAX as u32 {
                       (PrefType::Int, PrefValue { int_val: u as i32 })
                   } else {
                       token =
                           self.error_and_recover(Token::Error("integer literal overflowed"), "");
                       continue;
                   }
               }
               Token::SingleChar(b'-') => {
                   token = self.get_token(&mut none_str);
                   if let Token::Int(u) = token {
                       // Accept u <= 2147483648; anything larger will overflow i32 once negated.
                       if u <= std::i32::MAX as u32 {
                           (
                               PrefType::Int,
                               PrefValue {
                                   int_val: -(u as i32),
                               },
                           )
                       } else if u == std::i32::MAX as u32 + 1 {
                           // 2147483648 cannot be cast to i32 and then
                           // negated, so i32::MIN is produced directly.
                           (
                               PrefType::Int,
                               PrefValue {
                                   int_val: std::i32::MIN,
                               },
                           )
                       } else {
                           token = self
                               .error_and_recover(Token::Error("integer literal overflowed"), "");
                           continue;
                       }
                   } else {
                       token = self.error_and_recover(token, "expected integer literal after '-'");
                       continue;
                   }
               }
               Token::SingleChar(b'+') => {
                   token = self.get_token(&mut none_str);
                   if let Token::Int(u) = token {
                       // Accept u <= 2147483647; anything larger will overflow i32.
                       if u <= std::i32::MAX as u32 {
                           (PrefType::Int, PrefValue { int_val: u as i32 })
                       } else {
                           token = self
                               .error_and_recover(Token::Error("integer literal overflowed"), "");
                           continue;
                       }
                   } else {
                       token = self.error_and_recover(token, "expected integer literal after '+'");
                       continue;
                   }
               }
               _ => {
                   token = self.error_and_recover(token, "expected pref value after ','");
                   continue;
               }
           };

           // ("," <pref-attr>)*   // default pref files only
           let mut is_locked = false;
           let mut has_attrs = false;
           if self.kind == PrefValueKind::Default {
               // The inner loop yields true when it stops at a token that is
               // not ',' (left in `token` for the ")" check below), and false
               // when a bad attribute has already been reported.
               let ok = loop {
                   // ","
                   token = self.get_token(&mut none_str);
                   if token != Token::SingleChar(b',') {
                       break true;
                   }

                   // <pref-attr>
                   token = self.get_token(&mut none_str);
                   match token {
                       Token::Sticky => is_sticky = true,
                       Token::Locked => is_locked = true,
                       _ => {
                           token =
                               self.error_and_recover(token, "expected pref attribute after ','");
                           break false;
                       }
                   }
                   has_attrs = true;
               };
               if !ok {
                   continue;
               }
           } else {
               token = self.get_token(&mut none_str);
           }

           // ")"
           if token != Token::SingleChar(b')') {
               let expected_msg = if self.kind == PrefValueKind::Default {
                   if has_attrs {
                       "expected ',' or ')' after pref attribute"
                   } else {
                       "expected ',' or ')' after pref value"
                   }
               } else {
                   "expected ')' after pref value"
               };
               token = self.error_and_recover(token, expected_msg);
               continue;
           }

           // ";"
           token = self.get_token(&mut none_str);
           if token != Token::SingleChar(b';') {
               token = self.error_and_recover(token, "expected ';' after ')'");
               continue;
           }

           // The whole pref definition was well-formed, so report it.
           unsafe {
               (self.pref_fn)(
                   pref_name.as_ptr() as *const c_char,
                   pref_type,
                   pref_value_kind,
                   pref_value,
                   is_sticky,
                   is_locked,
               )
           };

           token = self.get_token(&mut none_str);
       }
   }
    559 
    560    fn error_and_recover(&mut self, token: Token, msg: &str) -> Token {
    561        self.has_errors = true;
    562 
    563        // If `token` is a Token::{Error,ErrorAtLine}, it's a lexing error and
    564        // the error message is within `token`. Otherwise, it's a parsing error
    565        // and the error message is in `msg`.
    566        let (msg, line_num) = match token {
    567            Token::Error(token_msg) => (token_msg, self.line_num),
    568            Token::ErrorAtLine(token_msg, line_num) => (token_msg, line_num),
    569            _ => (msg, self.line_num),
    570        };
    571        let msg = format!("{}:{}: prefs parse error: {}", self.path, line_num, msg);
    572        let msg = std::ffi::CString::new(msg).unwrap();
    573        unsafe { (self.error_fn)(msg.as_ptr() as *const c_char) };
    574 
    575        // "Panic-mode" recovery: consume tokens until one of the following
    576        // occurs.
    577        // - We hit a semicolon, whereupon we return the following token.
    578        // - We hit EOF, whereupon we return EOF.
    579        //
    580        // For this to work, if the lexing functions hit EOF in an error case
    581        // they must unget it so we can safely reget it here.
    582        //
    583        // If the starting token (passed in above) is EOF we must not get
    584        // another token otherwise we will read past the end of `self.buf`.
    585        let mut dummy_str = Vec::with_capacity(128);
    586        let mut token = token;
    587        loop {
    588            match token {
    589                Token::SingleChar(b';') => return self.get_token(&mut dummy_str),
    590                Token::SingleChar(EOF) => return token,
    591                _ => {}
    592            }
    593            token = self.get_token(&mut dummy_str);
    594        }
    595    }
    596 
    597    #[inline(always)]
    598    fn get_char(&mut self) -> u8 {
    599        // We do the bounds check ourselves so we can return EOF on failure.
    600        // (Although the buffer is guaranteed to end in an EOF char, we might
    601        // go one char past that, whereupon we must return EOF again.)
    602        if self.i < self.buf.len() {
    603            let c = unsafe { *self.buf.get_unchecked(self.i) };
    604            self.i += 1;
    605            c
    606        } else {
    607            debug_assert!(self.i == self.buf.len());
    608            EOF
    609        }
    610    }
    611 
    612    // This function skips the bounds check in optimized builds. Using it at
    613    // the hottest two call sites gives a ~15% parsing speed boost.
    614    #[inline(always)]
    615    unsafe fn get_char_unchecked(&mut self) -> u8 {
    616        debug_assert!(self.i < self.buf.len());
    617        let c = *self.buf.get_unchecked(self.i);
    618        self.i += 1;
    619        c
    620    }
    621 
   // Steps the read position back by one char, so that the most recently read
   // char will be returned again by the next read. Must not be called when
   // the read position is at the start of the buffer (checked in debug builds
   // only).
   #[inline(always)]
   fn unget_char(&mut self) {
       debug_assert!(self.i > 0);
       self.i -= 1;
   }
    627 
    628    #[inline(always)]
    629    fn match_char(&mut self, c: u8) -> bool {
    630        if self.buf[self.i] == c {
    631            self.i += 1;
    632            return true;
    633        }
    634        false
    635    }
    636 
    637    #[inline(always)]
    638    fn match_single_line_comment(&mut self) {
    639        loop {
    640            // To reach here, the previous char must have been '/' (if this is
    641            // the first loop iteration) or non-special (if this is the second
    642            // or subsequent iteration), and assertions elsewhere ensure that
    643            // there must be at least one subsequent char after those chars
    644            // (the '\0' for EOF).
    645            let c = unsafe { self.get_char_unchecked() };
    646 
    647            // All the special chars have value <= b'\r'.
    648            if c > b'\r' {
    649                continue;
    650            }
    651            match c {
    652                b'\n' => {
    653                    self.line_num += 1;
    654                    break;
    655                }
    656                b'\r' => {
    657                    self.line_num += 1;
    658                    self.match_char(b'\n');
    659                    break;
    660                }
    661                EOF => {
    662                    break;
    663                }
    664                _ => continue,
    665            }
    666        }
    667    }
    668 
    669    // Returns false if we hit EOF without closing the comment.
    670    fn match_multi_line_comment(&mut self) -> bool {
    671        loop {
    672            match self.get_char() {
    673                b'*' => {
    674                    if self.match_char(b'/') {
    675                        return true;
    676                    }
    677                }
    678                b'\n' => {
    679                    self.line_num += 1;
    680                }
    681                b'\r' => {
    682                    self.line_num += 1;
    683                    self.match_char(b'\n');
    684                }
    685                EOF => return false,
    686                _ => continue,
    687            }
    688        }
    689    }
    690 
    691    fn match_hex_digits(&mut self, ndigits: i32) -> Option<u16> {
    692        debug_assert!(ndigits == 2 || ndigits == 4);
    693        let mut value: u16 = 0;
    694        for _ in 0..ndigits {
    695            value = value << 4;
    696            match self.get_char() {
    697                c @ b'0'..=b'9' => value += (c - b'0') as u16,
    698                c @ b'A'..=b'F' => value += (c - b'A') as u16 + 10,
    699                c @ b'a'..=b'f' => value += (c - b'a') as u16 + 10,
    700                _ => {
    701                    self.unget_char();
    702                    return None;
    703                }
    704            }
    705        }
    706        Some(value)
    707    }
    708 
    709    #[inline(always)]
    710    fn char_kind(c: u8) -> CharKind {
    711        // Use get_unchecked() because a u8 index cannot exceed this table's
    712        // bounds.
    713        unsafe { *CHAR_KINDS.get_unchecked(c as usize) }
    714    }
    715 
    716    #[inline(always)]
    717    fn is_special_string_char(c: u8) -> bool {
    718        // Use get_unchecked() because a u8 index cannot exceed this table's
    719        // bounds.
    720        unsafe { *SPECIAL_STRING_CHARS.get_unchecked(c as usize) }
    721    }
    722 
    723    // If the obtained Token has a value, it is put within the Token, unless
    724    // it's a string, in which case it's put in `str_buf`. This avoids
    725    // allocating a new Vec for every string, which is slow.
    726    fn get_token(&mut self, str_buf: &mut Vec<u8>) -> Token {
    727        loop {
    728            // Note: the following tests are ordered by frequency when parsing
    729            // greprefs.js:
    730            // - SingleChar      36.7%
    731            // - SpaceNL         27.7% (14.9% for spaces, 12.8% for NL)
    732            // - Keyword         13.4%
    733            // - Quote           11.4%
    734            // - Slash            8.1%
    735            // - Digit            2.7%
    736            // - Hash, CR, Other  0.0%
    737 
    738            let c = self.get_char();
    739            match Parser::char_kind(c) {
    740                CharKind::SingleChar => {
    741                    return Token::SingleChar(c);
    742                }
    743                CharKind::SpaceNL => {
    744                    // It's slightly faster to combine the handling of the
    745                    // space chars with NL than to handle them separately; we
    746                    // have an extra test for this case, but one fewer test for
    747                    // all the subsequent CharKinds.
    748                    if c == b'\n' {
    749                        self.line_num += 1;
    750                    }
    751                    continue;
    752                }
    753                CharKind::Keyword => {
    754                    let start = self.i - 1;
    755                    loop {
    756                        let c = self.get_char();
    757                        if Parser::char_kind(c) != CharKind::Keyword {
    758                            self.unget_char();
    759                            break;
    760                        }
    761                    }
    762                    for info in KEYWORD_INFOS.iter() {
    763                        if &self.buf[start..self.i] == info.string {
    764                            return info.token;
    765                        }
    766                    }
    767                    return Token::Error("unknown keyword");
    768                }
    769                CharKind::Quote => {
    770                    return self.get_string_token(c, str_buf);
    771                }
    772                CharKind::Slash => {
    773                    match self.get_char() {
    774                        b'/' => {
    775                            self.match_single_line_comment();
    776                        }
    777                        b'*' => {
    778                            if !self.match_multi_line_comment() {
    779                                return Token::Error("unterminated /* comment");
    780                            }
    781                        }
    782                        c @ _ => {
    783                            if c == b'\n' || c == b'\r' {
    784                                // Unget the newline char; the outer loop will
    785                                // reget it and adjust self.line_num
    786                                // appropriately.
    787                                self.unget_char();
    788                            }
    789                            return Token::Error("expected '/' or '*' after '/'");
    790                        }
    791                    }
    792                    continue;
    793                }
    794                CharKind::Digit => {
    795                    let mut value = Some((c - b'0') as u32);
    796                    loop {
    797                        let c = self.get_char();
    798                        match Parser::char_kind(c) {
    799                            CharKind::Digit => {
    800                                fn add_digit(value: Option<u32>, c: u8) -> Option<u32> {
    801                                    value?.checked_mul(10)?.checked_add((c - b'0') as u32)
    802                                }
    803                                value = add_digit(value, c);
    804                            }
    805                            CharKind::Keyword => {
    806                                // Reject things like "123foo". Error recovery
    807                                // will retokenize from "foo" onward.
    808                                self.unget_char();
    809                                return Token::Error("unexpected character in integer literal");
    810                            }
    811                            _ => {
    812                                self.unget_char();
    813                                break;
    814                            }
    815                        }
    816                    }
    817                    return match value {
    818                        Some(v) => Token::Int(v),
    819                        None => Token::Error("integer literal overflowed"),
    820                    };
    821                }
    822                CharKind::Hash => {
    823                    self.match_single_line_comment();
    824                    continue;
    825                }
    826                CharKind::CR => {
    827                    self.match_char(b'\n');
    828                    self.line_num += 1;
    829                    continue;
    830                }
    831                // Error recovery will retokenize from the next character.
    832                _ => return Token::Error("unexpected character"),
    833            }
    834        }
    835    }
    836 
    837    fn string_error_token(&self, token: &mut Token, msg: &'static str) {
    838        // We only want to capture the first tokenization error within a string.
    839        if *token == Token::String {
    840            *token = Token::ErrorAtLine(msg, self.line_num);
    841        }
    842    }
    843 
    844    // Always inline this because it has a single call site.
    845    #[inline(always)]
    846    fn get_string_token(&mut self, quote_char: u8, str_buf: &mut Vec<u8>) -> Token {
    847        // First scan through the string to see if it contains any chars that
    848        // need special handling.
    849        let start = self.i;
    850        let has_special_chars = loop {
    851            // To reach here, the previous char must have been a quote
    852            // (quote_char), and assertions elsewhere ensure that there must be
    853            // at least one subsequent char (the '\0' for EOF).
    854            let c = unsafe { self.get_char_unchecked() };
    855            if Parser::is_special_string_char(c) {
    856                break c != quote_char;
    857            }
    858        };
    859 
    860        // Clear str_buf's contents without changing its capacity.
    861        str_buf.clear();
    862 
    863        // If there are no special chars (the common case), we can bulk copy it
    864        // to str_buf. This is a lot faster than the char-by-char loop below.
    865        if !has_special_chars {
    866            str_buf.extend(&self.buf[start..self.i - 1]);
    867            str_buf.push(b'\0');
    868            return Token::String;
    869        }
    870 
    871        // There were special chars. Re-scan the string, filling in str_buf one
    872        // char at a time.
    873        //
    874        // On error, we change `token` to an error token and then keep going to
    875        // the end of the string literal. `str_buf` won't be used in that case.
    876        self.i = start;
    877        let mut token = Token::String;
    878 
    879        loop {
    880            let c = self.get_char();
    881            let c2 = if !Parser::is_special_string_char(c) {
    882                c
    883            } else if c == quote_char {
    884                break;
    885            } else if c == b'\\' {
    886                match self.get_char() {
    887                    b'\"' => b'\"',
    888                    b'\'' => b'\'',
    889                    b'\\' => b'\\',
    890                    b'n' => b'\n',
    891                    b'r' => b'\r',
    892                    b'x' => {
    893                        if let Some(value) = self.match_hex_digits(2) {
    894                            debug_assert!(value <= 0xff);
    895                            if value != 0 {
    896                                value as u8
    897                            } else {
    898                                self.string_error_token(&mut token, "\\x00 is not allowed");
    899                                continue;
    900                            }
    901                        } else {
    902                            self.string_error_token(&mut token, "malformed \\x escape sequence");
    903                            continue;
    904                        }
    905                    }
    906                    b'u' => {
    907                        if let Some(value) = self.match_hex_digits(4) {
    908                            let mut utf16 = vec![value];
    909                            if 0xd800 == (0xfc00 & value) {
    910                                // High surrogate value. Look for the low surrogate value.
    911                                if self.match_char(b'\\') && self.match_char(b'u') {
    912                                    if let Some(lo) = self.match_hex_digits(4) {
    913                                        if 0xdc00 == (0xfc00 & lo) {
    914                                            // Found a valid low surrogate.
    915                                            utf16.push(lo);
    916                                        } else {
    917                                            self.string_error_token(
    918                                                &mut token,
    919                                                "invalid low surrogate after high surrogate",
    920                                            );
    921                                            continue;
    922                                        }
    923                                    }
    924                                }
    925                                if utf16.len() != 2 {
    926                                    self.string_error_token(
    927                                        &mut token,
    928                                        "expected low surrogate after high surrogate",
    929                                    );
    930                                    continue;
    931                                }
    932                            } else if 0xdc00 == (0xfc00 & value) {
    933                                // Unaccompanied low surrogate value.
    934                                self.string_error_token(
    935                                    &mut token,
    936                                    "expected high surrogate before low surrogate",
    937                                );
    938                                continue;
    939                            } else if value == 0 {
    940                                self.string_error_token(&mut token, "\\u0000 is not allowed");
    941                                continue;
    942                            }
    943 
    944                            // Insert the UTF-16 sequence as UTF-8.
    945                            let utf8 = String::from_utf16(&utf16).unwrap();
    946                            str_buf.extend(utf8.as_bytes());
    947                        } else {
    948                            self.string_error_token(&mut token, "malformed \\u escape sequence");
    949                            continue;
    950                        }
    951                        continue; // We don't want to str_buf.push(c2) below.
    952                    }
    953                    c @ _ => {
    954                        if c == b'\n' || c == b'\r' {
    955                            // Unget the newline char; the outer loop will
    956                            // reget it and adjust self.line_num appropriately.
    957                            self.unget_char();
    958                        }
    959                        self.string_error_token(
    960                            &mut token,
    961                            "unexpected escape sequence character after '\\'",
    962                        );
    963                        continue;
    964                    }
    965                }
    966            } else if c == b'\n' {
    967                self.line_num += 1;
    968                c
    969            } else if c == b'\r' {
    970                self.line_num += 1;
    971                if self.match_char(b'\n') {
    972                    str_buf.push(b'\r');
    973                    b'\n'
    974                } else {
    975                    c
    976                }
    977            } else if c == EOF {
    978                self.string_error_token(&mut token, "unterminated string literal");
    979                break;
    980            } else {
    981                // This case is only hit for the non-closing quote char.
    982                debug_assert!((c == b'\'' || c == b'\"') && c != quote_char);
    983                c
    984            };
    985            str_buf.push(c2);
    986        }
    987        str_buf.push(b'\0');
    988 
    989        token
    990    }
    991 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE