tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Token.h (6880B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /*
      8 * Token-affiliated data structures except for TokenKind (defined in its own
      9 * header).
     10 */
     11 
     12 #ifndef frontend_Token_h
     13 #define frontend_Token_h
     14 
     15 #include "mozilla/Assertions.h"  // MOZ_ASSERT
     16 
     17 #include <compare>   // std::strong_ordering
     18 #include <stdint.h>  // uint32_t
     19 
     20 #include "frontend/ParserAtom.h"  // TaggedParserAtomIndex, TrivialTaggedParserAtomIndex
     21 #include "frontend/TokenKind.h"  // js::frontend::TokenKind
     22 #include "js/RegExpFlags.h"      // JS::RegExpFlags
     23 
     24 namespace js {
     25 
     26 namespace frontend {
     27 
     28 struct TokenPos {
     29  uint32_t begin = 0;  // Offset of the token's first code unit.
     30  uint32_t end = 0;    // Offset of 1 past the token's last code unit.
     31 
     32  TokenPos() = default;
     33  TokenPos(uint32_t begin, uint32_t end) : begin(begin), end(end) {}
     34 
     35  // Return a TokenPos that covers left, right, and anything in between.
     36  static TokenPos box(const TokenPos& left, const TokenPos& right) {
     37    MOZ_ASSERT(left.begin <= left.end);
     38    MOZ_ASSERT(left.end <= right.begin);
     39    MOZ_ASSERT(right.begin <= right.end);
     40    return TokenPos(left.begin, right.end);
     41  }
     42 
     43  constexpr bool operator==(const TokenPos& bpos) const = default;
     44 
     45  constexpr auto operator<=>(const TokenPos& bpos) const {
     46    return begin <=> bpos.begin;
     47  }
     48 
     49  bool encloses(const TokenPos& pos) const {
     50    return begin <= pos.begin && pos.end <= end;
     51  }
     52 };
     53 
// Whether a numeric literal contained a '.'; enumerator values are chosen so
// the enum converts naturally to bool.
enum DecimalPoint { NoDecimal = false, HasDecimal = true };

// The only escapes found in IdentifierName are of the Unicode flavor.
enum class IdentifierEscapes { None, SawUnicodeEscape };

// Whether a name is an ordinary identifier or a private (#-prefixed) name.
enum class NameVisibility { Public, Private };

// Forward declaration: Token below befriends this class so it can use
// Token's private Modifier enum.
class TokenStreamShared;
     62 
/**
 * A single scanned token: its kind, its source position, and (for literal
 * and name tokens) a kind-specific payload stored in the |u| union below.
 */
struct Token {
 private:
  // The lexical grammar of JavaScript has a quirk around the '/' character.
  // As the spec puts it:
  //
  // > There are several situations where the identification of lexical input
  // > elements is sensitive to the syntactic grammar context that is consuming
  // > the input elements. This requires multiple goal symbols for the lexical
  // > grammar. [...] The InputElementRegExp goal symbol is used in all
  // > syntactic grammar contexts where a RegularExpressionLiteral is permitted
  // > [...]  In all other contexts, InputElementDiv is used as the lexical
  // > goal symbol.
  //
  // https://tc39.github.io/ecma262/#sec-lexical-and-regexp-grammars
  //
  // What "sensitive to the syntactic grammar context" means is, the parser has
  // to tell the TokenStream whether to interpret '/' as division or
  // RegExp. Because only one or the other (or neither) will be legal at that
  // point in the program, and only the parser knows which one.
  //
  // But there's a problem: the parser often gets a token, puts it back, then
  // consumes it later; or (equivalently) peeks at a token, leaves it, peeks
  // again later, then finally consumes it. Of course we don't actually re-scan
  // the token every time; we cache it in the TokenStream. This leads to the
  // following rule:
  //
  // The parser must not pass SlashIsRegExp when getting/peeking at a token
  // previously scanned with SlashIsDiv; or vice versa.
  //
  // That way, code that asks for a SlashIsRegExp mode will never get a cached
  // Div token. But this rule is easy to screw up, because tokens are so often
  // peeked at on Parser.cpp line A and consumed on line B, where |A-B| is
  // thousands of lines. We therefore enforce it with the frontend's most
  // annoying assertion (in verifyConsistentModifier), and provide
  // Modifier::SlashIsInvalid to help avoid tripping it.
  //
  // This enum belongs in TokenStream, but C++, so we define it here and
  // typedef it there.
  enum Modifier {
    // Parse `/` and `/=` as the division operators. (That is, use
    // InputElementDiv as the goal symbol.)
    SlashIsDiv,

    // Parse `/` as the beginning of a RegExp literal. (That is, use
    // InputElementRegExp.)
    SlashIsRegExp,

    // Neither a Div token nor a RegExp token is syntactically valid here. When
    // the parser calls `getToken(SlashIsInvalid)`, it must be prepared to see
    // either one (and throw a SyntaxError either way).
    //
    // It's OK to use SlashIsInvalid to get a token that was originally scanned
    // with SlashIsDiv or SlashIsRegExp. The reverse--peeking with
    // SlashIsInvalid, then getting with another mode--is not OK. If either Div
    // or RegExp is syntactically valid here, use the appropriate modifier.
    SlashIsInvalid,
  };
  // TokenStreamShared needs access to the private Modifier enum above.
  friend class TokenStreamShared;

 public:
  /** The type of this token. */
  TokenKind type;

  /** The token's position in the overall script. */
  TokenPos pos;

  /**
   * Kind-specific payload. Which union member (if any) is meaningful is
   * implied by |type|; the mutators/accessors below assert that
   * correspondence, so always go through them rather than touching the
   * union directly (its members are private for exactly this reason).
   */
  union U {
   private:
    friend struct Token;

    /** Name or string/template atom; see setName/setAtom for valid kinds. */
    TrivialTaggedParserAtomIndex atom;

    struct {
      /** Numeric literal's value. */
      double value;

      /** Does the numeric literal contain a '.'? */
      DecimalPoint decimalPoint;
    } number;

    /** Regular expression flags; use charBuffer to access source chars. */
    JS::RegExpFlags reflags;

   public:
    // Deliberately leaves the union uninitialized: a member only becomes
    // meaningful after one of Token's set* mutators runs.
    U() {};
  } u;

#ifdef DEBUG
  /** The modifier used to get this token. */
  Modifier modifier;
#endif

  // Mutators

  /** Record the atom for a Name or PrivateName token. */
  void setName(TaggedParserAtomIndex name) {
    MOZ_ASSERT(type == TokenKind::Name || type == TokenKind::PrivateName);
    u.atom = TrivialTaggedParserAtomIndex::from(name);
  }

  /** Record the atom for a String, TemplateHead, or NoSubsTemplate token. */
  void setAtom(TaggedParserAtomIndex atom) {
    MOZ_ASSERT(type == TokenKind::String || type == TokenKind::TemplateHead ||
               type == TokenKind::NoSubsTemplate);
    u.atom = TrivialTaggedParserAtomIndex::from(atom);
  }

  /** Record the flags for a RegExp token. */
  void setRegExpFlags(JS::RegExpFlags flags) {
    MOZ_ASSERT(type == TokenKind::RegExp);
    u.reflags = flags;
  }

  /** Record value and decimal-point presence for a Number token. */
  void setNumber(double n, DecimalPoint decimalPoint) {
    MOZ_ASSERT(type == TokenKind::Number);
    u.number.value = n;
    u.number.decimalPoint = decimalPoint;
  }

  // Type-safe accessors

  /** The atom of a Name or PrivateName token (implicitly widened). */
  TaggedParserAtomIndex name() const {
    MOZ_ASSERT(type == TokenKind::Name || type == TokenKind::PrivateName);
    return u.atom;
  }

  /** The atom of a String, TemplateHead, or NoSubsTemplate token. */
  TaggedParserAtomIndex atom() const {
    MOZ_ASSERT(type == TokenKind::String || type == TokenKind::TemplateHead ||
               type == TokenKind::NoSubsTemplate);
    return u.atom;
  }

  /** The flags of a RegExp token. */
  JS::RegExpFlags regExpFlags() const {
    MOZ_ASSERT(type == TokenKind::RegExp);
    return u.reflags;
  }

  /** The numeric value of a Number token. */
  double number() const {
    MOZ_ASSERT(type == TokenKind::Number);
    return u.number.value;
  }

  /** Whether a Number token's literal contained a decimal point. */
  DecimalPoint decimalPoint() const {
    MOZ_ASSERT(type == TokenKind::Number);
    return u.number.decimalPoint;
  }
};
    207 
    208 }  // namespace frontend
    209 
    210 }  // namespace js
    211 
    212 #endif  // frontend_Token_h