[ tor-browser ].git.dasho

TokenStream.cpp (123048B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 // JS lexical scanner.
      8 
      9 #include "frontend/TokenStream.h"
     10 
     11 #include "mozilla/ArrayUtils.h"
     12 #include "mozilla/Attributes.h"
     13 #include "mozilla/Likely.h"
     14 #include "mozilla/Maybe.h"
     15 #include "mozilla/MemoryChecking.h"
     16 #include "mozilla/ScopeExit.h"
     17 #include "mozilla/Span.h"
     18 #include "mozilla/TextUtils.h"
     19 #include "mozilla/Utf8.h"
     20 
     21 #include <algorithm>
     22 #include <iterator>
     23 #include <limits>
     24 #include <stdarg.h>
     25 #include <stdint.h>
     26 #include <stdio.h>
     27 #include <type_traits>
     28 #include <utility>
     29 
     30 #include "jsnum.h"
     31 
     32 #include "frontend/FrontendContext.h"
     33 #include "frontend/Parser.h"
     34 #include "frontend/ParserAtom.h"
     35 #include "frontend/ReservedWords.h"
     36 #include "js/CharacterEncoding.h"  // JS::ConstUTF8CharsZ
     37 #include "js/ColumnNumber.h"  // JS::LimitedColumnNumberOneOrigin, JS::ColumnNumberOneOrigin, JS::TaggedColumnNumberOneOrigin
     38 #include "js/ErrorReport.h"   // JSErrorBase
     39 #include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
     40 #include "js/Printf.h"                // JS_smprintf
     41 #include "js/RegExpFlags.h"           // JS::RegExpFlags
     42 #include "js/UniquePtr.h"
     43 #include "util/Text.h"
     44 #include "util/Unicode.h"
     45 #include "vm/FrameIter.h"  // js::{,NonBuiltin}FrameIter
     46 #include "vm/JSContext.h"
     47 #include "vm/Realm.h"
     48 
     49 using mozilla::AsciiAlphanumericToNumber;
     50 using mozilla::AssertedCast;
     51 using mozilla::DecodeOneUtf8CodePoint;
     52 using mozilla::IsAscii;
     53 using mozilla::IsAsciiAlpha;
     54 using mozilla::IsAsciiDigit;
     55 using mozilla::IsAsciiHexDigit;
     56 using mozilla::IsTrailingUnit;
     57 using mozilla::MakeScopeExit;
     58 using mozilla::Maybe;
     59 using mozilla::PointerRangeSize;
     60 using mozilla::Span;
     61 using mozilla::Utf8Unit;
     62 
     63 using JS::ReadOnlyCompileOptions;
     64 using JS::RegExpFlag;
     65 using JS::RegExpFlags;
     66 
     67 struct ReservedWordInfo {
     68  const char* chars;  // C string with reserved word text
     69  js::frontend::TokenKind tokentype;
     70 };
     71 
     72 static const ReservedWordInfo reservedWords[] = {
     73 #define RESERVED_WORD_INFO(word, name, type) {#word, js::frontend::type},
     74    FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
     75 #undef RESERVED_WORD_INFO
     76 };
     77 
     78 enum class ReservedWordsIndex : size_t {
     79 #define ENTRY_(_1, NAME, _3) NAME,
     80  FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
     81 #undef ENTRY_
     82 };
     83 
     84 // Returns a ReservedWordInfo for the specified characters, or nullptr if the
     85 // string is not a reserved word.
     86 template <typename CharT>
     87 static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
     88  MOZ_ASSERT(length != 0);
     89 
     90  size_t i;
     91  const ReservedWordInfo* rw;
     92  const char* chars;
     93 
     94 #define JSRW_LENGTH() length
     95 #define JSRW_AT(column) s[column]
     96 #define JSRW_GOT_MATCH(index) \
     97  i = (index);                \
     98  goto got_match;
     99 #define JSRW_TEST_GUESS(index) \
    100  i = (index);                 \
    101  goto test_guess;
    102 #define JSRW_NO_MATCH() goto no_match;
    103 #include "frontend/ReservedWordsGenerated.h"
    104 #undef JSRW_NO_MATCH
    105 #undef JSRW_TEST_GUESS
    106 #undef JSRW_GOT_MATCH
    107 #undef JSRW_AT
    108 #undef JSRW_LENGTH
    109 
    110 got_match:
    111  return &reservedWords[i];
    112 
    113 test_guess:
    114  rw = &reservedWords[i];
    115  chars = rw->chars;
    116  do {
    117    if (*s++ != static_cast<unsigned char>(*chars++)) {
    118      goto no_match;
    119    }
    120  } while (--length != 0);
    121  return rw;
    122 
    123 no_match:
    124  return nullptr;
    125 }
    126 
    127 template <>
    128 MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
    129    const Utf8Unit* units, size_t length) {
    130  return FindReservedWord(Utf8AsUnsignedChars(units), length);
    131 }
    132 
    133 static const ReservedWordInfo* FindReservedWord(
    134    const js::frontend::TaggedParserAtomIndex atom) {
    135  switch (atom.rawData()) {
    136 #define CASE_(_1, NAME, _3)                                           \
    137  case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
    138    return &reservedWords[size_t(ReservedWordsIndex::NAME)];
    139    FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
    140 #undef CASE_
    141  }
    142 
    143  return nullptr;
    144 }
    145 
    146 template <typename CharT>
    147 static constexpr bool IsAsciiBinary(CharT c) {
    148  using UnsignedCharT = std::make_unsigned_t<CharT>;
    149  auto uc = static_cast<UnsignedCharT>(c);
    150  return uc == '0' || uc == '1';
    151 }
    152 
    153 template <typename CharT>
    154 static constexpr bool IsAsciiOctal(CharT c) {
    155  using UnsignedCharT = std::make_unsigned_t<CharT>;
    156  auto uc = static_cast<UnsignedCharT>(c);
    157  return '0' <= uc && uc <= '7';
    158 }
    159 
    160 template <typename CharT>
    161 static constexpr uint8_t AsciiOctalToNumber(CharT c) {
    162  using UnsignedCharT = std::make_unsigned_t<CharT>;
    163  auto uc = static_cast<UnsignedCharT>(c);
    164  return uc - '0';
    165 }
    166 
    167 namespace js {
    168 
    169 namespace frontend {
    170 
    171 bool IsKeyword(TaggedParserAtomIndex atom) {
    172  if (const ReservedWordInfo* rw = FindReservedWord(atom)) {
    173    return TokenKindIsKeyword(rw->tokentype);
    174  }
    175 
    176  return false;
    177 }
    178 
    179 TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
    180  if (const ReservedWordInfo* rw = FindReservedWord(name)) {
    181    return rw->tokentype;
    182  }
    183 
    184  return TokenKind::Limit;
    185 }
    186 
    187 const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
    188  if (const ReservedWordInfo* rw = FindReservedWord(name)) {
    189    return ReservedWordToCharZ(rw->tokentype);
    190  }
    191 
    192  return nullptr;
    193 }
    194 
    195 const char* ReservedWordToCharZ(TokenKind tt) {
    196  MOZ_ASSERT(tt != TokenKind::Name);
    197  switch (tt) {
    198 #define EMIT_CASE(word, name, type) \
    199  case type:                        \
    200    return #word;
    201    FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
    202 #undef EMIT_CASE
    203    default:
    204      MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
    205  }
    206  return nullptr;
    207 }
    208 
    209 TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(
    210    TokenKind tt) const {
    211  MOZ_ASSERT(tt != TokenKind::Name);
    212  switch (tt) {
    213 #define EMIT_CASE(word, name, type) \
    214  case type:                        \
    215    return TaggedParserAtomIndex::WellKnown::name();
    216    FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
    217 #undef EMIT_CASE
    218    default:
    219      MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
    220  }
    221  return TaggedParserAtomIndex::null();
    222 }
    223 
    224 SourceCoords::SourceCoords(FrontendContext* fc, uint32_t initialLineNumber,
    225                           uint32_t initialOffset)
    226    : lineStartOffsets_(fc), initialLineNum_(initialLineNumber), lastIndex_(0) {
    227  // This is actually necessary!  Removing it causes compile errors on
    228  // GCC and clang.  You could try declaring this:
    229  //
    230  //   const uint32_t SourceCoords::MAX_PTR;
    231  //
    232  // which fixes the GCC/clang error, but causes bustage on Windows.  Sigh.
    233  //
    234  uint32_t maxPtr = MAX_PTR;
    235 
    236  // The first line begins at buffer offset |initialOffset|.  MAX_PTR is the
    237  // sentinel.  The appends cannot fail because |lineStartOffsets_| has
    238  // statically-allocated elements.
    239  MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
    240  MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
    241  lineStartOffsets_.infallibleAppend(initialOffset);
    242  lineStartOffsets_.infallibleAppend(maxPtr);
    243 }
    244 
    245 MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
    246                                         uint32_t lineStartOffset) {
    247  uint32_t index = indexFromLineNumber(lineNum);
    248  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
    249 
    250  MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
    251  MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);
    252 
    253  if (index == sentinelIndex) {
    254    // We haven't seen this newline before.  Update lineStartOffsets_
    255    // only if lineStartOffsets_.append succeeds, to keep sentinel.
    256    // Otherwise return false to tell TokenStream about OOM.
    257    uint32_t maxPtr = MAX_PTR;
    258    if (!lineStartOffsets_.append(maxPtr)) {
    259      static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
    260                                   TempAllocPolicy&>,
    261                    "this function's caller depends on it reporting an "
    262                    "error on failure, as TempAllocPolicy ensures");
    263      return false;
    264    }
    265 
    266    lineStartOffsets_[index] = lineStartOffset;
    267  } else {
    268    // We have seen this newline before (and ungot it).  Do nothing (other
    269    // than checking it hasn't mysteriously changed).
    270    // This path can be executed after hitting OOM, so check index.
    271    MOZ_ASSERT_IF(index < sentinelIndex,
    272                  lineStartOffsets_[index] == lineStartOffset);
    273  }
    274  return true;
    275 }
    276 
    277 MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
    278  MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
    279  MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
    280  MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);
    281 
    282  if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) {
    283    return true;
    284  }
    285 
    286  uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
    287  lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];
    288 
    289  for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
    290       i++) {
    291    if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
    292      return false;
    293    }
    294  }
    295  return true;
    296 }
    297 
    298 MOZ_ALWAYS_INLINE uint32_t
    299 SourceCoords::indexFromOffset(uint32_t offset) const {
    300  uint32_t iMin, iMax, iMid;
    301 
    302  if (lineStartOffsets_[lastIndex_] <= offset) {
    303    // If we reach here, offset is on a line the same as or higher than
    304    // last time.  Check first for the +0, +1, +2 cases, because they
    305    // typically cover 85--98% of cases.
    306    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
    307      return lastIndex_;  // index is same as last time
    308    }
    309 
    310    // If we reach here, there must be at least one more entry (plus the
    311    // sentinel).  Try it.
    312    lastIndex_++;
    313    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
    314      return lastIndex_;  // index is one higher than last time
    315    }
    316 
    317    // The same logic applies here.
    318    lastIndex_++;
    319    if (offset < lineStartOffsets_[lastIndex_ + 1]) {
    320      return lastIndex_;  // index is two higher than last time
    321    }
    322 
    323    // No luck.  Oh well, we have a better-than-default starting point for
    324    // the binary search.
    325    iMin = lastIndex_ + 1;
    326    MOZ_ASSERT(iMin <
    327               lineStartOffsets_.length() - 1);  // -1 due to the sentinel
    328 
    329  } else {
    330    iMin = 0;
    331  }
    332 
    333  // This is a binary search with deferred detection of equality, which was
    334  // marginally faster in this case than a standard binary search.
    335  // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
    336  // want one before that.
    337  iMax = lineStartOffsets_.length() - 2;
    338  while (iMax > iMin) {
    339    iMid = iMin + (iMax - iMin) / 2;
    340    if (offset >= lineStartOffsets_[iMid + 1]) {
    341      iMin = iMid + 1;  // offset is above lineStartOffsets_[iMid]
    342    } else {
    343      iMax = iMid;  // offset is below or within lineStartOffsets_[iMid]
    344    }
    345  }
    346 
    347  MOZ_ASSERT(iMax == iMin);
    348  MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
    349  MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
    350 
    351  lastIndex_ = iMin;
    352  return iMin;
    353 }
    354 
    355 SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
    356  return LineToken(indexFromOffset(offset), offset);
    357 }
    358 
    359 TokenStreamAnyChars::TokenStreamAnyChars(FrontendContext* fc,
    360                                         const ReadOnlyCompileOptions& options,
    361                                         StrictModeGetter* smg)
    362    : fc(fc),
    363      options_(options),
    364      strictModeGetter_(smg),
    365      filename_(options.filename()),
    366      longLineColumnInfo_(fc),
    367      srcCoords(fc, options.lineno, options.scriptSourceOffset),
    368      lineno(options.lineno),
    369      mutedErrors(options.mutedErrors()) {
    370  // |isExprEnding| was initially zeroed: overwrite the true entries here.
    371  isExprEnding[size_t(TokenKind::Comma)] = true;
    372  isExprEnding[size_t(TokenKind::Semi)] = true;
    373  isExprEnding[size_t(TokenKind::Colon)] = true;
    374  isExprEnding[size_t(TokenKind::RightParen)] = true;
    375  isExprEnding[size_t(TokenKind::RightBracket)] = true;
    376  isExprEnding[size_t(TokenKind::RightCurly)] = true;
    377 }
    378 
    379 template <typename Unit>
    380 TokenStreamCharsBase<Unit>::TokenStreamCharsBase(FrontendContext* fc,
    381                                                 ParserAtomsTable* parserAtoms,
    382                                                 const Unit* units,
    383                                                 size_t length,
    384                                                 size_t startOffset)
    385    : TokenStreamCharsShared(fc, parserAtoms),
    386      sourceUnits(units, length, startOffset) {}
    387 
    388 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
    389                                                        const char16_t* cur,
    390                                                        const char16_t* end) {
    391  MOZ_ASSERT(charBuffer.length() == 0);
    392 
    393  while (cur < end) {
    394    char16_t ch = *cur++;
    395    if (ch == '\r') {
    396      ch = '\n';
    397      if (cur < end && *cur == '\n') {
    398        cur++;
    399      }
    400    }
    401 
    402    if (!charBuffer.append(ch)) {
    403      return false;
    404    }
    405  }
    406 
    407  MOZ_ASSERT(cur == end);
    408  return true;
    409 }
    410 
    411 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
    412                                                        const Utf8Unit* cur,
    413                                                        const Utf8Unit* end) {
    414  MOZ_ASSERT(charBuffer.length() == 0);
    415 
    416  while (cur < end) {
    417    Utf8Unit unit = *cur++;
    418    if (MOZ_LIKELY(IsAscii(unit))) {
    419      char16_t ch = unit.toUint8();
    420      if (ch == '\r') {
    421        ch = '\n';
    422        if (cur < end && *cur == Utf8Unit('\n')) {
    423          cur++;
    424        }
    425      }
    426 
    427      if (!charBuffer.append(ch)) {
    428        return false;
    429      }
    430 
    431      continue;
    432    }
    433 
    434    Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end);
    435    MOZ_ASSERT(ch.isSome(),
    436               "provided source text should already have been validated");
    437 
    438    if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
    439      return false;
    440    }
    441  }
    442 
    443  MOZ_ASSERT(cur == end);
    444  return true;
    445 }
    446 
    447 template <typename Unit, class AnyCharsAccess>
    448 TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
    449    FrontendContext* fc, ParserAtomsTable* parserAtoms,
    450    const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
    451    : TokenStreamChars<Unit, AnyCharsAccess>(fc, parserAtoms, units, length,
    452                                             options.scriptSourceOffset) {}
    453 
    454 bool TokenStreamAnyChars::checkOptions() {
    455  // Constrain starting columns to where they will saturate.
    456  if (options().column.oneOriginValue() >
    457      JS::LimitedColumnNumberOneOrigin::Limit) {
    458    reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
    459    return false;
    460  }
    461 
    462  return true;
    463 }
    464 
    465 void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) const {
    466  va_list args;
    467  va_start(args, errorNumber);
    468 
    469  reportErrorNoOffsetVA(errorNumber, &args);
    470 
    471  va_end(args);
    472 }
    473 
    474 void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
    475                                                va_list* args) const {
    476  ErrorMetadata metadata;
    477  computeErrorMetadataNoOffset(&metadata);
    478 
    479  ReportCompileErrorLatin1VA(fc, std::move(metadata), nullptr, errorNumber,
    480                             args);
    481 }
    482 
    483 [[nodiscard]] MOZ_ALWAYS_INLINE bool
    484 TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
    485  prevLinebase = linebase;
    486  linebase = lineStartOffset;
    487  lineno++;
    488 
    489  // On overflow, report error.
    490  if (MOZ_UNLIKELY(!lineno)) {
    491    reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
    492    return false;
    493  }
    494 
    495  return srcCoords.add(lineno, linebase);
    496 }
    497 
    498 #ifdef DEBUG
    499 
    500 template <>
    501 inline void SourceUnits<char16_t>::assertNextCodePoint(
    502    const PeekedCodePoint<char16_t>& peeked) {
    503  char32_t c = peeked.codePoint();
    504  if (c < unicode::NonBMPMin) {
    505    MOZ_ASSERT(peeked.lengthInUnits() == 1);
    506    MOZ_ASSERT(ptr[0] == c);
    507  } else {
    508    MOZ_ASSERT(peeked.lengthInUnits() == 2);
    509    char16_t lead, trail;
    510    unicode::UTF16Encode(c, &lead, &trail);
    511    MOZ_ASSERT(ptr[0] == lead);
    512    MOZ_ASSERT(ptr[1] == trail);
    513  }
    514 }
    515 
    516 template <>
    517 inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
    518    const PeekedCodePoint<Utf8Unit>& peeked) {
    519  char32_t c = peeked.codePoint();
    520 
    521  // This is all roughly indulgence of paranoia only for assertions, so the
    522  // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
    523  uint8_t expectedUnits[4] = {};
    524  if (c < 0x80) {
    525    expectedUnits[0] = AssertedCast<uint8_t>(c);
    526  } else if (c < 0x800) {
    527    expectedUnits[0] = 0b1100'0000 | (c >> 6);
    528    expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
    529  } else if (c < 0x10000) {
    530    expectedUnits[0] = 0b1110'0000 | (c >> 12);
    531    expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
    532    expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
    533  } else {
    534    expectedUnits[0] = 0b1111'0000 | (c >> 18);
    535    expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
    536    expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
    537    expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
    538  }
    539 
    540  MOZ_ASSERT(peeked.lengthInUnits() <= 4);
    541  for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
    542    MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
    543  }
    544 }
    545 
    546 #endif  // DEBUG
    547 
    548 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    549    const Utf8Unit** ptr, const Utf8Unit* limit) {
    550  MOZ_ASSERT(*ptr <= limit);
    551 
    552  // |limit| is a code point boundary.
    553  if (MOZ_UNLIKELY(*ptr == limit)) {
    554    return;
    555  }
    556 
    557  // Otherwise rewind past trailing units to the start of the code point.
    558 #ifdef DEBUG
    559  size_t retracted = 0;
    560 #endif
    561  while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
    562    --*ptr;
    563 #ifdef DEBUG
    564    retracted++;
    565 #endif
    566  }
    567 
    568  MOZ_ASSERT(retracted < 4,
    569             "the longest UTF-8 code point is four units, so this should never "
    570             "retract more than three units");
    571 }
    572 
    573 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
    574    const char16_t** ptr, const char16_t* limit) {
    575  MOZ_ASSERT(*ptr <= limit);
    576 
    577  // |limit| is a code point boundary.
    578  if (MOZ_UNLIKELY(*ptr == limit)) {
    579    return;
    580  }
    581 
    582  // Otherwise the pointer must be retracted by one iff it splits a two-unit
    583  // code point.
    584  if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) {
    585    // Outside test suites testing garbage WTF-16, it's basically guaranteed
    586    // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair.
    587    if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
    588      --*ptr;
    589    }
    590  }
    591 }
    592 
    593 template <typename Unit>
    594 JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffset(
    595    const LineToken lineToken, const uint32_t offset,
    596    const SourceUnits<Unit>& sourceUnits) const {
    597  lineToken.assertConsistentOffset(offset);
    598 
    599  const uint32_t start = srcCoords.lineStart(lineToken);
    600  const uint32_t offsetInLine = offset - start;
    601 
    602  if constexpr (std::is_same_v<Unit, char16_t>) {
    603    // Column offset is in UTF-16 code units.
    604    return JS::ColumnNumberUnsignedOffset(offsetInLine);
    605  }
    606 
    607  return computeColumnOffsetForUTF8(lineToken, offset, start, offsetInLine,
    608                                    sourceUnits);
    609 }
    610 
    611 template <typename Unit>
    612 JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffsetForUTF8(
    613    const LineToken lineToken, const uint32_t offset, const uint32_t start,
    614    const uint32_t offsetInLine, const SourceUnits<Unit>& sourceUnits) const {
    615  const uint32_t line = lineNumber(lineToken);
    616 
    617  // Reset the previous offset/column number offset cache for this line, if the
    618  // previous lookup wasn't on this line.
    619  if (line != lineOfLastColumnComputation_) {
    620    lineOfLastColumnComputation_ = line;
    621    lastChunkVectorForLine_ = nullptr;
    622    lastOffsetOfComputedColumn_ = start;
    623    lastComputedColumnOffset_ = JS::ColumnNumberUnsignedOffset::zero();
    624  }
    625 
    626  // Compute and return the final column number offset from a partially
    627  // calculated offset/column number offset, using the last-cached
    628  // offset/column number offset if they're more optimal.
    629  auto OffsetFromPartial =
    630      [this, offset, &sourceUnits](
    631          uint32_t partialOffset,
    632          JS::ColumnNumberUnsignedOffset partialColumnOffset,
    633          UnitsType unitsType) {
    634        MOZ_ASSERT(partialOffset <= offset);
    635 
    636        // If the last lookup on this line was closer to |offset|, use it.
    637        if (partialOffset < this->lastOffsetOfComputedColumn_ &&
    638            this->lastOffsetOfComputedColumn_ <= offset) {
    639          partialOffset = this->lastOffsetOfComputedColumn_;
    640          partialColumnOffset = this->lastComputedColumnOffset_;
    641        }
    642 
    643        const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
    644        const Unit* end = sourceUnits.codeUnitPtrAt(offset);
    645 
    646        size_t offsetDelta =
    647            AssertedCast<uint32_t>(PointerRangeSize(begin, end));
    648        partialOffset += offsetDelta;
    649 
    650        if (unitsType == UnitsType::GuaranteedSingleUnit) {
    651          MOZ_ASSERT(unicode::CountUTF16CodeUnits(begin, end) == offsetDelta,
    652                     "guaranteed-single-units also guarantee pointer distance "
    653                     "equals UTF-16 code unit count");
    654          partialColumnOffset += JS::ColumnNumberUnsignedOffset(offsetDelta);
    655        } else {
    656          partialColumnOffset += JS::ColumnNumberUnsignedOffset(
    657              AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end)));
    658        }
    659 
    660        this->lastOffsetOfComputedColumn_ = partialOffset;
    661        this->lastComputedColumnOffset_ = partialColumnOffset;
    662        return partialColumnOffset;
    663      };
    664 
    665  // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
    666  // column has offset less than this value.  The most common (non-minified)
    667  // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
    668  // the next power of two for efficient division/multiplication below.
    669  constexpr uint32_t ColumnChunkLength = mozilla::RoundUpPow2(100);
    670 
    671  // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
    672  const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
    673  if (chunkIndex == 0) {
    674    // We don't know from an |offset| in the zeroth chunk that this line is even
    675    // long.  First-chunk info is mostly useless, anyway -- we have |start|
    676    // already.  So if we have *easy* access to that zeroth chunk, use it --
    677    // otherwise just count pessimally.  (This will still benefit from caching
    678    // the last column/offset for computations for successive offsets, so it's
    679    // not *always* worst-case.)
    680    UnitsType unitsType;
    681    if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
    682      MOZ_ASSERT((*lastChunkVectorForLine_)[0].columnOffset() ==
    683                 JS::ColumnNumberUnsignedOffset::zero());
    684      unitsType = (*lastChunkVectorForLine_)[0].unitsType();
    685    } else {
    686      unitsType = UnitsType::PossiblyMultiUnit;
    687    }
    688 
    689    return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
    690                             unitsType);
    691  }
    692 
    693  // If this line has no chunk vector yet, insert one in the hash map.  (The
    694  // required index is allocated and filled further down.)
    695  if (!lastChunkVectorForLine_) {
    696    auto ptr = longLineColumnInfo_.lookupForAdd(line);
    697    if (!ptr) {
    698      // This could rehash and invalidate a cached vector pointer, but the outer
    699      // condition means we don't have a cached pointer.
    700      if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(fc))) {
    701        // In case of OOM, just count columns from the start of the line.
    702        fc->recoverFromOutOfMemory();
    703        return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
    704                                 UnitsType::PossiblyMultiUnit);
    705      }
    706    }
    707 
    708    // Note that adding elements to this vector won't invalidate this pointer.
    709    lastChunkVectorForLine_ = &ptr->value();
    710  }
    711 
    712  const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
    713 
    714  auto RetractedOffsetOfChunk = [
    715 #ifdef DEBUG
    716                                    this,
    717 #endif
    718                                    start, limit,
    719                                    &sourceUnits](uint32_t index) {
    720    MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
    721 
    722    uint32_t naiveOffset = start + index * ColumnChunkLength;
    723    const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
    724 
    725    const Unit* actualPtr = naivePtr;
    726    RetractPointerToCodePointBoundary(&actualPtr, limit);
    727 
    728 #ifdef DEBUG
    729    if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
    730        UnitsType::GuaranteedSingleUnit) {
    731      MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
    732    }
    733 #endif
    734 
    735    return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
    736  };
    737 
    738  uint32_t partialOffset;
    739  JS::ColumnNumberUnsignedOffset partialColumnOffset;
    740  UnitsType unitsType;
    741 
    742  auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
    743  if (chunkIndex < entriesLen) {
    744    // We've computed the chunk |offset| resides in.  Compute the column number
    745    // from the chunk.
    746    partialOffset = RetractedOffsetOfChunk(chunkIndex);
    747    partialColumnOffset = (*lastChunkVectorForLine_)[chunkIndex].columnOffset();
    748 
    749    // This is exact if |chunkIndex| isn't the last chunk.
    750    unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
    751 
    752    // Otherwise the last chunk is pessimistically assumed to contain multi-unit
    753    // code points because we haven't fully examined its contents yet -- they
    754    // may not have been tokenized yet, they could contain encoding errors, or
    755    // they might not even exist.
    756    MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
    757                  (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
    758                      UnitsType::PossiblyMultiUnit);
    759  } else {
    760    // Extend the vector from its last entry or the start of the line.  (This is
    761    // also a suitable partial start point if we must recover from OOM.)
    762    if (entriesLen > 0) {
    763      partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
    764      partialColumnOffset =
    765          (*lastChunkVectorForLine_)[entriesLen - 1].columnOffset();
    766    } else {
    767      partialOffset = start;
    768      partialColumnOffset = JS::ColumnNumberUnsignedOffset::zero();
    769    }
    770 
    771    if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
    772      // As earlier, just start from the greatest offset/column in case of OOM.
    773      fc->recoverFromOutOfMemory();
    774      return OffsetFromPartial(partialOffset, partialColumnOffset,
    775                               UnitsType::PossiblyMultiUnit);
    776    }
    777 
    778    // OOM is no longer possible now.  \o/
    779 
    780    // The vector always begins with the column of the line start, i.e. zero,
    781    // with chunk units pessimally assumed not single-unit.
    782    if (entriesLen == 0) {
    783      lastChunkVectorForLine_->infallibleAppend(
    784          ChunkInfo(JS::ColumnNumberUnsignedOffset::zero(),
    785                    UnitsType::PossiblyMultiUnit));
    786      entriesLen++;
    787    }
    788 
    789    do {
    790      const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
    791      const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
    792          start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
    793 
    794      MOZ_ASSERT(begin < chunkLimit);
    795      MOZ_ASSERT(chunkLimit <= limit);
    796 
    797      static_assert(
    798          ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
    799          "any retraction below is assumed to never underflow to the "
    800          "preceding chunk, even for the longest code point");
    801 
    802      // Prior tokenizing ensured that [begin, limit) is validly encoded, and
    803      // |begin < chunkLimit|, so any retraction here can't underflow.
    804      RetractPointerToCodePointBoundary(&chunkLimit, limit);
    805 
    806      MOZ_ASSERT(begin < chunkLimit);
    807      MOZ_ASSERT(chunkLimit <= limit);
    808 
    809      size_t numUnits = PointerRangeSize(begin, chunkLimit);
    810      size_t numUTF16CodeUnits =
    811          unicode::CountUTF16CodeUnits(begin, chunkLimit);
    812 
    813      // If this chunk (which will become non-final at the end of the loop) is
    814      // all single-unit code points, annotate the chunk accordingly.
    815      if (numUnits == numUTF16CodeUnits) {
    816        lastChunkVectorForLine_->back().guaranteeSingleUnits();
    817      }
    818 
    819      partialOffset += numUnits;
    820      partialColumnOffset += JS::ColumnNumberUnsignedOffset(numUTF16CodeUnits);
    821 
    822      lastChunkVectorForLine_->infallibleEmplaceBack(
    823          partialColumnOffset, UnitsType::PossiblyMultiUnit);
    824    } while (entriesLen < chunkIndex + 1);
    825 
    826    // We're at a spot in the current final chunk, and final chunks never have
    827    // complete units information, so be pessimistic.
    828    unitsType = UnitsType::PossiblyMultiUnit;
    829  }
    830 
    831  return OffsetFromPartial(partialOffset, partialColumnOffset, unitsType);
    832 }
    833 
    834 template <typename Unit, class AnyCharsAccess>
    835 JS::LimitedColumnNumberOneOrigin
    836 GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
    837    LineToken lineToken, uint32_t offset) const {
    838  lineToken.assertConsistentOffset(offset);
    839 
    840  const TokenStreamAnyChars& anyChars = anyCharsAccess();
    841 
    842  JS::ColumnNumberUnsignedOffset columnOffset =
    843      anyChars.computeColumnOffset(lineToken, offset, this->sourceUnits);
    844 
    845  if (!lineToken.isFirstLine()) {
    846    return JS::LimitedColumnNumberOneOrigin::fromUnlimited(
    847        JS::ColumnNumberOneOrigin() + columnOffset);
    848  }
    849 
    850  if (1 + columnOffset.value() > JS::LimitedColumnNumberOneOrigin::Limit) {
    851    return JS::LimitedColumnNumberOneOrigin::limit();
    852  }
    853 
    854  return JS::LimitedColumnNumberOneOrigin::fromUnlimited(
    855      (anyChars.options_.column + columnOffset).oneOriginValue());
    856 }
    857 
    858 template <typename Unit, class AnyCharsAccess>
    859 void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
    860    uint32_t offset, uint32_t* line,
    861    JS::LimitedColumnNumberOneOrigin* column) const {
    862  const TokenStreamAnyChars& anyChars = anyCharsAccess();
    863 
    864  auto lineToken = anyChars.lineToken(offset);
    865  *line = anyChars.lineNumber(lineToken);
    866  *column = computeColumn(lineToken, offset);
    867 }
    868 
    869 template <class AnyCharsAccess>
    870 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
    871    uint8_t relevantUnits, unsigned errorNumber, ...) {
    872  va_list args;
    873  va_start(args, errorNumber);
    874 
    875  do {
    876    size_t offset = this->sourceUnits.offset();
    877 
    878    ErrorMetadata err;
    879 
    880    TokenStreamAnyChars& anyChars = anyCharsAccess();
    881 
    882    bool canAddLineOfContext = fillExceptingContext(&err, offset);
    883    if (canAddLineOfContext) {
    884      if (!internalComputeLineOfContext(&err, offset)) {
    885        break;
    886      }
    887 
    888      // As this is an encoding error, the computed window-end must be
    889      // identical to the location of the error -- any further on and the
    890      // window would contain invalid Unicode.
    891      MOZ_ASSERT_IF(err.lineOfContext != nullptr,
    892                    err.lineLength == err.tokenOffset);
    893    }
    894 
    895    auto notes = MakeUnique<JSErrorNotes>();
    896    if (!notes) {
    897      ReportOutOfMemory(anyChars.fc);
    898      break;
    899    }
    900 
    901    // The largest encoding of a UTF-8 code point is 4 units.  (Encoding an
    902    // obsolete 5- or 6-byte code point will complain only about a bad lead
    903    // code unit.)
    904    constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
    905 
    906    MOZ_ASSERT(relevantUnits > 0);
    907 
    908    char badUnitsStr[MaxWidth];
    909    char* ptr = badUnitsStr;
    910    while (relevantUnits > 0) {
    911      byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
    912      ptr[4] = ' ';
    913 
    914      ptr += 5;
    915      relevantUnits--;
    916    }
    917 
    918    ptr[-1] = '\0';
    919 
    920    uint32_t line;
    921    JS::LimitedColumnNumberOneOrigin column;
    922    computeLineAndColumn(offset, &line, &column);
    923 
    924    if (!notes->addNoteASCII(anyChars.fc, anyChars.getFilename().c_str(), 0,
    925                             line, JS::ColumnNumberOneOrigin(column),
    926                             GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS,
    927                             badUnitsStr)) {
    928      break;
    929    }
    930 
    931    ReportCompileErrorLatin1VA(anyChars.fc, std::move(err), std::move(notes),
    932                               errorNumber, &args);
    933  } while (false);
    934 
    935  va_end(args);
    936 }
    937 
    938 template <class AnyCharsAccess>
    939 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
    940    Utf8Unit lead) {
    941  uint8_t leadValue = lead.toUint8();
    942 
    943  char leadByteStr[5];
    944  byteToTerminatedString(leadValue, leadByteStr);
    945 
    946  internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
    947 }
    948 
    949 template <class AnyCharsAccess>
    950 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
    951    Utf8Unit lead, uint8_t remaining, uint8_t required) {
    952  uint8_t leadValue = lead.toUint8();
    953 
    954  MOZ_ASSERT(required == 2 || required == 3 || required == 4);
    955  MOZ_ASSERT(remaining < 4);
    956  MOZ_ASSERT(remaining < required);
    957 
    958  char leadByteStr[5];
    959  byteToTerminatedString(leadValue, leadByteStr);
    960 
    961  // |toHexChar| produces the desired decimal numbers for values < 4.
    962  const char expectedStr[] = {toHexChar(required - 1), '\0'};
    963  const char actualStr[] = {toHexChar(remaining - 1), '\0'};
    964 
    965  internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
    966                        expectedStr, required == 2 ? "" : "s", actualStr,
    967                        remaining == 2 ? " was" : "s were");
    968 }
    969 
    970 template <class AnyCharsAccess>
    971 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
    972    uint8_t unitsObserved) {
    973  Utf8Unit badUnit =
    974      this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];
    975 
    976  char badByteStr[5];
    977  byteToTerminatedString(badUnit.toUint8(), badByteStr);
    978 
    979  internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
    980                        badByteStr);
    981 }
    982 
    983 template <class AnyCharsAccess>
    984 MOZ_COLD void
    985 TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
    986    char32_t codePoint, uint8_t codePointLength, const char* reason) {
    987  // Construct a string like "0x203D" (including null terminator) to include
    988  // in the error message.  Write the string end-to-start from end to start
    989  // of an adequately sized |char| array, shifting least significant nibbles
    990  // off the number and writing the corresponding hex digits until done, then
    991  // prefixing with "0x".  |codePointStr| points at the incrementally
    992  // computed string, within |codePointCharsArray|'s bounds.
    993 
    994  // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
    995  // bits in a four-byte UTF-8 code unit sequence.
    996  constexpr size_t MaxHexSize = sizeof(
    997      "0x1F"
    998      "FFFF");  // including '\0'
    999  char codePointCharsArray[MaxHexSize];
   1000 
   1001  char* codePointStr = std::end(codePointCharsArray);
   1002  *--codePointStr = '\0';
   1003 
   1004  // Note that by do-while looping here rather than while-looping, this
   1005  // writes a '0' when |codePoint == 0|.
   1006  do {
   1007    MOZ_ASSERT(codePointCharsArray < codePointStr);
   1008    *--codePointStr = toHexChar(codePoint & 0xF);
   1009    codePoint >>= 4;
   1010  } while (codePoint);
   1011 
   1012  MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
   1013  *--codePointStr = 'x';
   1014  *--codePointStr = '0';
   1015 
   1016  internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
   1017                        codePointStr, reason);
   1018 }
   1019 
   1020 template <class AnyCharsAccess>
   1021 [[nodiscard]] bool
   1022 TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
   1023    Utf8Unit lead, char32_t* codePoint) {
   1024  auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
   1025 
   1026  auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
   1027    this->notEnoughUnits(lead, remaining, required);
   1028  };
   1029 
   1030  auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
   1031    this->badTrailingUnit(unitsObserved);
   1032  };
   1033 
   1034  auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
   1035    this->badCodePoint(badCodePoint, unitsObserved);
   1036  };
   1037 
   1038  auto onNotShortestForm = [this](char32_t badCodePoint,
   1039                                  uint8_t unitsObserved) {
   1040    this->notShortestForm(badCodePoint, unitsObserved);
   1041  };
   1042 
   1043  // If a valid code point is decoded, this function call consumes its code
   1044  // units.  If not, it ungets the lead code unit and invokes the right error
   1045  // handler, so on failure we must immediately return false.
   1046  SourceUnitsIterator iter(this->sourceUnits);
   1047  Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
   1048      lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
   1049      onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
   1050  if (maybeCodePoint.isNothing()) {
   1051    return false;
   1052  }
   1053 
   1054  *codePoint = maybeCodePoint.value();
   1055  return true;
   1056 }
   1057 
   1058 template <class AnyCharsAccess>
   1059 bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
   1060    int32_t lead, char32_t* codePoint) {
   1061  MOZ_ASSERT(lead != EOF);
   1062  MOZ_ASSERT(!isAsciiCodePoint(lead),
   1063             "ASCII code unit/point must be handled separately");
   1064  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
   1065             "getNonAsciiCodePoint called incorrectly");
   1066 
   1067  // The code point is usually |lead|: overwrite later if needed.
   1068  *codePoint = AssertedCast<char32_t>(lead);
   1069 
   1070  // ECMAScript specifically requires that unpaired UTF-16 surrogates be
   1071  // treated as the corresponding code point and not as an error.  See
   1072  // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
   1073  // Thus this function does not consider any sequence of 16-bit numbers to
   1074  // be intrinsically in error.
   1075 
   1076  // Dispense with single-unit code points and lone trailing surrogates.
   1077  if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
   1078    if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
   1079                     lead == unicode::PARA_SEPARATOR)) {
   1080      if (!updateLineInfoForEOL()) {
   1081 #ifdef DEBUG
   1082        // Assign to a sentinel value to hopefully cause errors.
   1083        *codePoint = std::numeric_limits<char32_t>::max();
   1084 #endif
   1085        MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
   1086        return false;
   1087      }
   1088 
   1089      *codePoint = '\n';
   1090    } else {
   1091      MOZ_ASSERT(!IsLineTerminator(*codePoint));
   1092    }
   1093 
   1094    return true;
   1095  }
   1096 
   1097  // Also handle a lead surrogate not paired with a trailing surrogate.
   1098  if (MOZ_UNLIKELY(
   1099          this->sourceUnits.atEnd() ||
   1100          !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
   1101    MOZ_ASSERT(!IsLineTerminator(*codePoint));
   1102    return true;
   1103  }
   1104 
   1105  // Otherwise we have a multi-unit code point.
   1106  *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
   1107  MOZ_ASSERT(!IsLineTerminator(*codePoint));
   1108  return true;
   1109 }
   1110 
   1111 template <class AnyCharsAccess>
   1112 bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
   1113    int32_t unit, char32_t* codePoint) {
   1114  MOZ_ASSERT(unit != EOF);
   1115  MOZ_ASSERT(!isAsciiCodePoint(unit),
   1116             "ASCII code unit/point must be handled separately");
   1117 
   1118  Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
   1119  MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
   1120             "getNonAsciiCodePoint called incorrectly");
   1121 
   1122  auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
   1123 
   1124  auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
   1125                                        uint_fast8_t required) {
   1126    this->notEnoughUnits(lead, remaining, required);
   1127  };
   1128 
   1129  auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
   1130    this->badTrailingUnit(unitsObserved);
   1131  };
   1132 
   1133  auto onBadCodePoint = [this](char32_t badCodePoint,
   1134                               uint_fast8_t unitsObserved) {
   1135    this->badCodePoint(badCodePoint, unitsObserved);
   1136  };
   1137 
   1138  auto onNotShortestForm = [this](char32_t badCodePoint,
   1139                                  uint_fast8_t unitsObserved) {
   1140    this->notShortestForm(badCodePoint, unitsObserved);
   1141  };
   1142 
   1143  // This consumes the full, valid code point or ungets |lead| and calls the
   1144  // appropriate error functor on failure.
   1145  SourceUnitsIterator iter(this->sourceUnits);
   1146  Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
   1147      lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
   1148      onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
   1149  if (maybeCodePoint.isNothing()) {
   1150    return false;
   1151  }
   1152 
   1153  char32_t cp = maybeCodePoint.value();
   1154  if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
   1155                   cp == unicode::PARA_SEPARATOR)) {
   1156    if (!updateLineInfoForEOL()) {
   1157 #ifdef DEBUG
   1158      // Assign to a sentinel value to hopefully cause errors.
   1159      *codePoint = std::numeric_limits<char32_t>::max();
   1160 #endif
   1161      MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
   1162      return false;
   1163    }
   1164 
   1165    *codePoint = '\n';
   1166  } else {
   1167    MOZ_ASSERT(!IsLineTerminator(cp));
   1168    *codePoint = cp;
   1169  }
   1170 
   1171  return true;
   1172 }
   1173 
   1174 template <>
   1175 size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
   1176  // This is JS's understanding of UTF-16 that allows lone surrogates, so
   1177  // we have to exclude lone surrogates from [windowStart, offset) ourselves.
   1178 
   1179  const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
   1180 
   1181  const char16_t* const initial = codeUnitPtrAt(offset);
   1182  const char16_t* p = initial;
   1183 
   1184  auto HalfWindowSize = [&p, &initial]() {
   1185    return PointerRangeSize(p, initial);
   1186  };
   1187 
   1188  while (true) {
   1189    MOZ_ASSERT(earliestPossibleStart <= p);
   1190    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1191    if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
   1192      break;
   1193    }
   1194 
   1195    char16_t c = p[-1];
   1196 
   1197    // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
   1198    // string and template literals.  These code points do affect line and
   1199    // column coordinates, even as they encode their literal values.
   1200    if (IsLineTerminator(c)) {
   1201      break;
   1202    }
   1203 
   1204    // Don't allow invalid UTF-16 in pre-context.  (Current users don't
   1205    // require this, and this behavior isn't currently imposed on
   1206    // pre-context, but these facts might change someday.)
   1207 
   1208    if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
   1209      break;
   1210    }
   1211 
   1212    // Optimistically include the code unit, reverting below if needed.
   1213    p--;
   1214 
   1215    // If it's not a surrogate at all, keep going.
   1216    if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
   1217      continue;
   1218    }
   1219 
   1220    // Stop if we don't have a usable surrogate pair.
   1221    if (HalfWindowSize() >= WindowRadius ||
   1222        p <= earliestPossibleStart ||      // trail surrogate at low end
   1223        !unicode::IsLeadSurrogate(p[-1]))  // no paired lead surrogate
   1224    {
   1225      p++;
   1226      break;
   1227    }
   1228 
   1229    p--;
   1230  }
   1231 
   1232  MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1233  return offset - HalfWindowSize();
   1234 }
   1235 
   1236 template <>
   1237 size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
   1238  // |offset| must be the location of the error or somewhere before it, so we
   1239  // know preceding data is valid UTF-8.
   1240 
   1241  const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
   1242 
   1243  const Utf8Unit* const initial = codeUnitPtrAt(offset);
   1244  const Utf8Unit* p = initial;
   1245 
   1246  auto HalfWindowSize = [&p, &initial]() {
   1247    return PointerRangeSize(p, initial);
   1248  };
   1249 
   1250  while (true) {
   1251    MOZ_ASSERT(earliestPossibleStart <= p);
   1252    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1253    if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
   1254      break;
   1255    }
   1256 
   1257    // Peek backward for a line break, and only decrement if there is none.
   1258    uint8_t prev = p[-1].toUint8();
   1259 
   1260    // First check for the ASCII LineTerminators.
   1261    if (prev == '\r' || prev == '\n') {
   1262      break;
   1263    }
   1264 
   1265    // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
   1266    // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9).  If there
   1267    // aren't three code units available, some comparison here will fail
   1268    // before we'd underflow.
   1269    if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
   1270                     p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
   1271      break;
   1272    }
   1273 
   1274    // Rewind over the non-LineTerminator.  This can't underflow
   1275    // |earliestPossibleStart| because it begins a code point.
   1276    while (IsTrailingUnit(*--p)) {
   1277      continue;
   1278    }
   1279 
   1280    MOZ_ASSERT(earliestPossibleStart <= p);
   1281 
   1282    // But if we underflowed |WindowRadius|, adjust forward and stop.
   1283    if (HalfWindowSize() > WindowRadius) {
   1284      static_assert(WindowRadius > 3,
   1285                    "skipping over non-lead code units below must not "
   1286                    "advance past |offset|");
   1287 
   1288      while (IsTrailingUnit(*++p)) {
   1289        continue;
   1290      }
   1291 
   1292      MOZ_ASSERT(HalfWindowSize() < WindowRadius);
   1293      break;
   1294    }
   1295  }
   1296 
   1297  MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1298  return offset - HalfWindowSize();
   1299 }
   1300 
   1301 template <>
   1302 size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
   1303  const char16_t* const initial = codeUnitPtrAt(offset);
   1304  const char16_t* p = initial;
   1305 
   1306  auto HalfWindowSize = [&initial, &p]() {
   1307    return PointerRangeSize(initial, p);
   1308  };
   1309 
   1310  while (true) {
   1311    MOZ_ASSERT(p <= limit_);
   1312    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1313    if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
   1314      break;
   1315    }
   1316 
   1317    char16_t c = *p;
   1318 
   1319    // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
   1320    // string and template literals.  These code points do affect line and
   1321    // column coordinates, even as they encode their literal values.
   1322    if (IsLineTerminator(c)) {
   1323      break;
   1324    }
   1325 
   1326    // Don't allow invalid UTF-16 in post-context.  (Current users don't
   1327    // require this, and this behavior isn't currently imposed on
   1328    // pre-context, but these facts might change someday.)
   1329 
   1330    if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) {
   1331      break;
   1332    }
   1333 
   1334    // Optimistically consume the code unit, ungetting it below if needed.
   1335    p++;
   1336 
   1337    // If it's not a surrogate at all, keep going.
   1338    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) {
   1339      continue;
   1340    }
   1341 
   1342    // Retract if the lead surrogate would stand alone at the end of the
   1343    // window.
   1344    if (HalfWindowSize() >= WindowRadius ||  // split pair
   1345        p >= limit_ ||                       // half-pair at end of source
   1346        !unicode::IsTrailSurrogate(*p))      // no paired trail surrogate
   1347    {
   1348      p--;
   1349      break;
   1350    }
   1351 
   1352    p++;
   1353  }
   1354 
   1355  return offset + HalfWindowSize();
   1356 }
   1357 
   1358 template <>
   1359 size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
   1360  const Utf8Unit* const initial = codeUnitPtrAt(offset);
   1361  const Utf8Unit* p = initial;
   1362 
   1363  auto HalfWindowSize = [&initial, &p]() {
   1364    return PointerRangeSize(initial, p);
   1365  };
   1366 
   1367  while (true) {
   1368    MOZ_ASSERT(p <= limit_);
   1369    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1370    if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
   1371      break;
   1372    }
   1373 
   1374    // A non-encoding error might be followed by an encoding error within
   1375    // |maxEnd|, so we must validate as we go to not include invalid UTF-8
   1376    // in the computed window.  What joy!
   1377 
   1378    Utf8Unit lead = *p;
   1379    if (mozilla::IsAscii(lead)) {
   1380      if (IsSingleUnitLineTerminator(lead)) {
   1381        break;
   1382      }
   1383 
   1384      p++;
   1385      continue;
   1386    }
   1387 
   1388    PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_);
   1389    if (peeked.isNone()) {
   1390      break;  // encoding error
   1391    }
   1392 
   1393    char32_t c = peeked.codePoint();
   1394    if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
   1395                     c == unicode::PARA_SEPARATOR)) {
   1396      break;
   1397    }
   1398 
   1399    MOZ_ASSERT(!IsLineTerminator(c));
   1400 
   1401    uint8_t len = peeked.lengthInUnits();
   1402    if (HalfWindowSize() + len > WindowRadius) {
   1403      break;
   1404    }
   1405 
   1406    p += len;
   1407  }
   1408 
   1409  MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
   1410  return offset + HalfWindowSize();
   1411 }
   1412 
   1413 template <typename Unit, class AnyCharsAccess>
   1414 bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
   1415  const Unit* end = this->sourceUnits.codeUnitPtrAt(position);
   1416  while (this->sourceUnits.addressOfNextCodeUnit() < end) {
   1417    if (!getCodePoint()) {
   1418      return false;
   1419    }
   1420  }
   1421 
   1422  TokenStreamAnyChars& anyChars = anyCharsAccess();
   1423  Token* cur = const_cast<Token*>(&anyChars.currentToken());
   1424  cur->pos.begin = this->sourceUnits.offset();
   1425  cur->pos.end = cur->pos.begin;
   1426 #ifdef DEBUG
   1427  cur->type = TokenKind::Limit;
   1428 #endif
   1429  MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type));
   1430  anyChars.lookahead = 0;
   1431  return true;
   1432 }
   1433 
   1434 template <typename Unit, class AnyCharsAccess>
   1435 void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
   1436  TokenStreamAnyChars& anyChars = anyCharsAccess();
   1437 
   1438  this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
   1439                                             /* allowPoisoned = */ true);
   1440  anyChars.flags = pos.flags;
   1441  anyChars.lineno = pos.lineno;
   1442  anyChars.linebase = pos.linebase;
   1443  anyChars.prevLinebase = pos.prevLinebase;
   1444  anyChars.lookahead = pos.lookahead;
   1445 
   1446  anyChars.tokens[anyChars.cursor()] = pos.currentToken;
   1447  for (unsigned i = 0; i < anyChars.lookahead; i++) {
   1448    anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i];
   1449  }
   1450 }
   1451 
   1452 template <typename Unit, class AnyCharsAccess>
   1453 bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
   1454    const Position& pos, const TokenStreamAnyChars& other) {
   1455  if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) {
   1456    return false;
   1457  }
   1458 
   1459  seekTo(pos);
   1460  return true;
   1461 }
   1462 
   1463 void TokenStreamAnyChars::computeErrorMetadataNoOffset(
   1464    ErrorMetadata* err) const {
   1465  err->isMuted = mutedErrors;
   1466  err->filename = filename_;
   1467  err->lineNumber = 0;
   1468  err->columnNumber = JS::ColumnNumberOneOrigin();
   1469 
   1470  MOZ_ASSERT(err->lineOfContext == nullptr);
   1471 }
   1472 
   1473 bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
   1474                                               uint32_t offset) const {
   1475  err->isMuted = mutedErrors;
   1476 
   1477  // If this TokenStreamAnyChars doesn't have location information, try to
   1478  // get it from the caller.
   1479  if (!filename_) {
   1480    JSContext* maybeCx = context()->maybeCurrentJSContext();
   1481    if (maybeCx) {
   1482      NonBuiltinFrameIter iter(maybeCx,
   1483                               FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
   1484                               maybeCx->realm()->principals());
   1485      if (!iter.done() && iter.filename()) {
   1486        err->filename = JS::ConstUTF8CharsZ(iter.filename());
   1487        JS::TaggedColumnNumberOneOrigin columnNumber;
   1488        err->lineNumber = iter.computeLine(&columnNumber);
   1489        err->columnNumber =
   1490            JS::ColumnNumberOneOrigin(columnNumber.oneOriginValue());
   1491        return false;
   1492      }
   1493    }
   1494  }
   1495 
   1496  // Otherwise use this TokenStreamAnyChars's location information.
   1497  err->filename = filename_;
   1498  return true;
   1499 }
   1500 
   1501 template <>
   1502 inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
   1503    const char16_t* encodedWindow, size_t encodedTokenOffset,
   1504    size_t* utf16TokenOffset, size_t encodedWindowLength,
   1505    size_t* utf16WindowLength) const {
   1506  MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
   1507 }
   1508 
   1509 template <>
   1510 inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
   1511    const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
   1512    size_t* utf16TokenOffset, size_t encodedWindowLength,
   1513    size_t* utf16WindowLength) const {
   1514  MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
   1515             "token offset must be within the window, and the two lambda "
   1516             "calls below presume this ordering of values");
   1517 
   1518  const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength;
   1519 
   1520  size_t i = 0;
   1521  auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) {
   1522    while (encodedWindow < limit) {
   1523      Utf8Unit lead = *encodedWindow++;
   1524      if (MOZ_LIKELY(IsAscii(lead))) {
   1525        // ASCII contributes a single UTF-16 code unit.
   1526        i++;
   1527        continue;
   1528      }
   1529 
   1530      Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
   1531      MOZ_ASSERT(cp.isSome(),
   1532                 "computed window should only contain valid UTF-8");
   1533 
   1534      i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
   1535    }
   1536 
   1537    return i;
   1538  };
   1539 
   1540  // Compute the token offset from |i == 0| and the initial |encodedWindow|.
   1541  const Utf8Unit* token = encodedWindow + encodedTokenOffset;
   1542  MOZ_ASSERT(token <= encodedWindowEnd);
   1543  *utf16TokenOffset = ComputeUtf16Count(token);
   1544 
   1545  // Compute the window length, picking up from |i| and |encodedWindow| that,
   1546  // in general, were modified just above.
   1547  *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
   1548 }
   1549 
   1550 template <typename Unit>
   1551 bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
   1552                                                  uint32_t offset) const {
   1553  // Rename the variable to make meaning clearer: an offset into source units
   1554  // in Unit encoding.
   1555  size_t encodedOffset = offset;
   1556 
   1557  // These are also offsets into source units in Unit encoding.
   1558  size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
   1559  size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);
   1560 
   1561  size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart;
   1562  MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);
   1563 
   1564  // Don't add a useless "line" of context when the window ends up empty
   1565  // because of an invalid encoding at the start of a line.
   1566  if (encodedWindowLength == 0) {
   1567    MOZ_ASSERT(err->lineOfContext == nullptr,
   1568               "ErrorMetadata::lineOfContext must be null so we don't "
   1569               "have to set the lineLength/tokenOffset fields");
   1570    return true;
   1571  }
   1572 
   1573  CharBuffer lineOfContext(fc);
   1574 
   1575  const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
   1576  if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
   1577          lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
   1578    return false;
   1579  }
   1580 
   1581  size_t utf16WindowLength = lineOfContext.length();
   1582 
   1583  // The windowed string is null-terminated.
   1584  if (!lineOfContext.append('\0')) {
   1585    return false;
   1586  }
   1587 
   1588  err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
   1589  if (!err->lineOfContext) {
   1590    return false;
   1591  }
   1592 
   1593  size_t encodedTokenOffset = encodedOffset - encodedWindowStart;
   1594 
   1595  MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
   1596             "token offset must be inside the window");
   1597 
   1598  // The length in UTF-8 code units of a code point is always greater than or
   1599  // equal to the same code point's length in UTF-16 code points.  ASCII code
   1600  // points are 1 unit in either encoding.  Code points in [U+0080, U+10000)
   1601  // are 2-3 UTF-8 code units to 1 UTF-16 code unit.  And code points in
   1602  // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units.
   1603  //
   1604  // Therefore, if encoded window length equals the length in UTF-16 (this is
   1605  // always the case for Unit=char16_t), the UTF-16 offsets are exactly the
   1606  // encoded offsets.  Otherwise we must convert offset/length from UTF-8 to
   1607  // UTF-16.
   1608  if constexpr (std::is_same_v<Unit, char16_t>) {
   1609    MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
   1610               "UTF-16 to UTF-16 shouldn't change window length");
   1611    err->tokenOffset = encodedTokenOffset;
   1612    err->lineLength = encodedWindowLength;
   1613  } else {
   1614    static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");
   1615 
   1616    bool simple = utf16WindowLength == encodedWindowLength;
   1617    MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
   1618                           [](Unit u) { return IsAscii(u); }) == simple,
   1619               "equal window lengths in UTF-8 should correspond only to "
   1620               "wholly-ASCII text");
   1621    if (simple) {
   1622      err->tokenOffset = encodedTokenOffset;
   1623      err->lineLength = encodedWindowLength;
   1624    } else {
   1625      sourceUnits.computeWindowOffsetAndLength(
   1626          encodedWindow, encodedTokenOffset, &err->tokenOffset,
   1627          encodedWindowLength, &err->lineLength);
   1628    }
   1629  }
   1630 
   1631  return true;
   1632 }
   1633 
   1634 template <typename Unit, class AnyCharsAccess>
   1635 bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
   1636    ErrorMetadata* err, const ErrorOffset& errorOffset) const {
   1637  if (errorOffset.is<NoOffset>()) {
   1638    anyCharsAccess().computeErrorMetadataNoOffset(err);
   1639    return true;
   1640  }
   1641 
   1642  uint32_t offset;
   1643  if (errorOffset.is<uint32_t>()) {
   1644    offset = errorOffset.as<uint32_t>();
   1645  } else {
   1646    offset = this->sourceUnits.offset();
   1647  }
   1648 
   1649  // This function's return value isn't a success/failure indication: it
   1650  // returns true if this TokenStream can be used to provide a line of
   1651  // context.
   1652  if (fillExceptingContext(err, offset)) {
   1653    // Add a line of context from this TokenStream to help with debugging.
   1654    return internalComputeLineOfContext(err, offset);
   1655  }
   1656 
   1657  // We can't fill in any more here.
   1658  return true;
   1659 }
   1660 
   1661 template <typename Unit, class AnyCharsAccess>
   1662 void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
   1663    int32_t cp) {
   1664  UniqueChars display = JS_smprintf("U+%04X", cp);
   1665  if (!display) {
   1666    ReportOutOfMemory(anyCharsAccess().fc);
   1667    return;
   1668  }
   1669  error(JSMSG_ILLEGAL_CHARACTER, display.get());
   1670 }
   1671 
   1672 // We have encountered a '\': check for a Unicode escape sequence after it.
   1673 // Return the length of the escape sequence and the encoded code point (by
   1674 // value) if we found a Unicode escape sequence, and skip all code units
   1675 // involed.  Otherwise, return 0 and don't advance along the buffer.
   1676 template <typename Unit, class AnyCharsAccess>
   1677 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
   1678    char32_t* codePoint) {
   1679  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
   1680 
   1681  int32_t unit = getCodeUnit();
   1682  if (unit != 'u') {
   1683    // NOTE: |unit| may be EOF here.
   1684    ungetCodeUnit(unit);
   1685    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
   1686    return 0;
   1687  }
   1688 
   1689  char16_t v;
   1690  unit = getCodeUnit();
   1691  if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) {
   1692    *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v;
   1693    return 5;
   1694  }
   1695 
   1696  if (unit == '{') {
   1697    return matchExtendedUnicodeEscape(codePoint);
   1698  }
   1699 
   1700  // NOTE: |unit| may be EOF here, so this ungets either one or two units.
   1701  ungetCodeUnit(unit);
   1702  ungetCodeUnit('u');
   1703  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
   1704  return 0;
   1705 }
   1706 
   1707 template <typename Unit, class AnyCharsAccess>
   1708 uint32_t
   1709 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
   1710    char32_t* codePoint) {
   1711  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
   1712 
   1713  int32_t unit = getCodeUnit();
   1714 
   1715  // Skip leading zeroes.
   1716  uint32_t leadingZeroes = 0;
   1717  while (unit == '0') {
   1718    leadingZeroes++;
   1719    unit = getCodeUnit();
   1720  }
   1721 
   1722  size_t i = 0;
   1723  uint32_t code = 0;
   1724  while (IsAsciiHexDigit(unit) && i < 6) {
   1725    code = (code << 4) | AsciiAlphanumericToNumber(unit);
   1726    unit = getCodeUnit();
   1727    i++;
   1728  }
   1729 
   1730  uint32_t gotten =
   1731      2 +                  // 'u{'
   1732      leadingZeroes + i +  // significant hexdigits
   1733      (unit != EOF);       // subtract a get if it didn't contribute to length
   1734 
   1735  if (unit == '}' && (leadingZeroes > 0 || i > 0) &&
   1736      code <= unicode::NonBMPMax) {
   1737    *codePoint = code;
   1738    return gotten;
   1739  }
   1740 
   1741  this->sourceUnits.unskipCodeUnits(gotten);
   1742  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
   1743  return 0;
   1744 }
   1745 
   1746 template <typename Unit, class AnyCharsAccess>
   1747 uint32_t
   1748 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
   1749    char32_t* codePoint) {
   1750  uint32_t length = matchUnicodeEscape(codePoint);
   1751  if (MOZ_LIKELY(length > 0)) {
   1752    if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) {
   1753      return length;
   1754    }
   1755 
   1756    this->sourceUnits.unskipCodeUnits(length);
   1757  }
   1758 
   1759  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
   1760  return 0;
   1761 }
   1762 
   1763 template <typename Unit, class AnyCharsAccess>
   1764 bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
   1765    char32_t* codePoint) {
   1766  uint32_t length = matchUnicodeEscape(codePoint);
   1767  if (MOZ_LIKELY(length > 0)) {
   1768    if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) {
   1769      return true;
   1770    }
   1771 
   1772    this->sourceUnits.unskipCodeUnits(length);
   1773  }
   1774 
   1775  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
   1776  return false;
   1777 }
   1778 
   1779 template <typename Unit, class AnyCharsAccess>
   1780 [[nodiscard]] bool
   1781 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart(
   1782    IdentifierEscapes* sawEscape) {
   1783  int32_t unit = getCodeUnit();
   1784  if (unit == EOF) {
   1785    error(JSMSG_MISSING_PRIVATE_NAME);
   1786    return false;
   1787  }
   1788 
   1789  if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   1790    if (unicode::IsIdentifierStart(char16_t(unit))) {
   1791      *sawEscape = IdentifierEscapes::None;
   1792      return true;
   1793    }
   1794 
   1795    if (unit == '\\') {
   1796      char32_t codePoint;
   1797      uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint);
   1798      if (escapeLength != 0) {
   1799        *sawEscape = IdentifierEscapes::SawUnicodeEscape;
   1800        return true;
   1801      }
   1802 
   1803      // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
   1804      // could point at the 'H'.  But we don't do that now, so the code
   1805      // unit after the '\' isn't necessarily bad, so just point at the
   1806      // start of the actually-invalid escape.
   1807      ungetCodeUnit('\\');
   1808      error(JSMSG_BAD_ESCAPE);
   1809      return false;
   1810    }
   1811  }
   1812 
   1813  // Unget the lead code unit before peeking at the full code point.
   1814  ungetCodeUnit(unit);
   1815 
   1816  PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
   1817  if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
   1818    this->sourceUnits.consumeKnownCodePoint(peeked);
   1819 
   1820    *sawEscape = IdentifierEscapes::None;
   1821    return true;
   1822  }
   1823 
   1824  error(JSMSG_MISSING_PRIVATE_NAME);
   1825  return false;
   1826 }
   1827 
   1828 template <typename Unit, class AnyCharsAccess>
   1829 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives(
   1830    bool isMultiline, bool shouldWarnDeprecated) {
   1831  // Match directive comments used in debugging, such as "//# sourceURL" and
   1832  // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated.
   1833  //
   1834  // To avoid a crashing bug in IE, several JavaScript transpilers wrap single
   1835  // line comments containing a source mapping URL inside a multiline
   1836  // comment. To avoid potentially expensive lookahead and backtracking, we
   1837  // only check for this case if we encounter a '#' code unit.
   1838 
   1839  bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
   1840             getSourceMappingURL(isMultiline, shouldWarnDeprecated);
   1841  if (!res) {
   1842    badToken();
   1843  }
   1844 
   1845  return res;
   1846 }
   1847 
   1848 [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo(
   1849    UniquePtr<char16_t[], JS::FreePolicy>* destination) {
   1850  size_t length = charBuffer.length();
   1851 
   1852  *destination = fc->getAllocator()->make_pod_array<char16_t>(length + 1);
   1853  if (!*destination) {
   1854    return false;
   1855  }
   1856 
   1857  std::copy(charBuffer.begin(), charBuffer.end(), destination->get());
   1858  (*destination)[length] = '\0';
   1859  return true;
   1860 }
   1861 
   1862 template <typename Unit, class AnyCharsAccess>
   1863 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
   1864    bool isMultiline, bool shouldWarnDeprecated, const char* directive,
   1865    uint8_t directiveLength, const char* errorMsgPragma,
   1866    UniquePtr<char16_t[], JS::FreePolicy>* destination) {
   1867  // Stop if we don't find |directive|.  (Note that |directive| must be
   1868  // ASCII, so there are no tricky encoding issues to consider in matching
   1869  // UTF-8/16-agnostically.)
   1870  if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
   1871    return true;
   1872  }
   1873 
   1874  if (shouldWarnDeprecated) {
   1875    if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
   1876      return false;
   1877    }
   1878  }
   1879 
   1880  this->charBuffer.clear();
   1881 
   1882  do {
   1883    int32_t unit = peekCodeUnit();
   1884    if (unit == EOF) {
   1885      break;
   1886    }
   1887 
   1888    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   1889      if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
   1890        break;
   1891      }
   1892 
   1893      consumeKnownCodeUnit(unit);
   1894 
   1895      // Debugging directives can occur in both single- and multi-line
   1896      // comments. If we're currently inside a multi-line comment, we
   1897      // also must recognize multi-line comment terminators.
   1898      if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
   1899        ungetCodeUnit('*');
   1900        break;
   1901      }
   1902 
   1903      if (!this->charBuffer.append(unit)) {
   1904        return false;
   1905      }
   1906 
   1907      continue;
   1908    }
   1909 
   1910    // This ignores encoding errors: subsequent caller-side code to
   1911    // handle the remaining source text in the comment will do so.
   1912    PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
   1913    if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
   1914      break;
   1915    }
   1916 
   1917    MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
   1918               "!IsSpace must imply !IsLineTerminator or else we'll fail to "
   1919               "maintain line-info/flags for EOL");
   1920    this->sourceUnits.consumeKnownCodePoint(peeked);
   1921 
   1922    if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
   1923      return false;
   1924    }
   1925  } while (true);
   1926 
   1927  if (this->charBuffer.empty()) {
   1928    // The directive's URL was missing, but comments can contain anything,
   1929    // so it isn't an error.
   1930    return true;
   1931  }
   1932 
   1933  return copyCharBufferTo(destination);
   1934 }
   1935 
   1936 template <typename Unit, class AnyCharsAccess>
   1937 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL(
   1938    bool isMultiline, bool shouldWarnDeprecated) {
   1939  // Match comments of the form "//# sourceURL=<url>" or
   1940  // "/\* //# sourceURL=<url> *\/"
   1941  //
   1942  // Note that while these are labeled "sourceURL" in the source text,
   1943  // internally we refer to it as a "displayURL" to distinguish what the
   1944  // developer would like to refer to the source as from the source's actual
   1945  // URL.
   1946 
   1947  static constexpr char sourceURLDirective[] = " sourceURL=";
   1948  constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective);
   1949  return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective,
   1950                      sourceURLDirectiveLength, "sourceURL",
   1951                      &anyCharsAccess().displayURL_);
   1952 }
   1953 
   1954 template <typename Unit, class AnyCharsAccess>
   1955 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL(
   1956    bool isMultiline, bool shouldWarnDeprecated) {
   1957  // Match comments of the form "//# sourceMappingURL=<url>" or
   1958  // "/\* //# sourceMappingURL=<url> *\/"
   1959 
   1960  static constexpr char sourceMappingURLDirective[] = " sourceMappingURL=";
   1961  constexpr uint8_t sourceMappingURLDirectiveLength =
   1962      js_strlen(sourceMappingURLDirective);
   1963  return getDirective(isMultiline, shouldWarnDeprecated,
   1964                      sourceMappingURLDirective,
   1965                      sourceMappingURLDirectiveLength, "sourceMappingURL",
   1966                      &anyCharsAccess().sourceMapURL_);
   1967 }
   1968 
   1969 template <typename Unit, class AnyCharsAccess>
   1970 MOZ_ALWAYS_INLINE Token*
   1971 GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal(
   1972    TokenKind kind, TokenStart start, TokenKind* out) {
   1973  MOZ_ASSERT(kind < TokenKind::Limit);
   1974  MOZ_ASSERT(kind != TokenKind::Eol,
   1975             "TokenKind::Eol should never be used in an actual Token, only "
   1976             "returned by peekTokenSameLine()");
   1977 
   1978  TokenStreamAnyChars& anyChars = anyCharsAccess();
   1979  anyChars.flags.isDirtyLine = true;
   1980 
   1981  Token* token = anyChars.allocateToken();
   1982 
   1983  *out = token->type = kind;
   1984  token->pos = TokenPos(start.offset(), this->sourceUnits.offset());
   1985  MOZ_ASSERT(token->pos.begin <= token->pos.end);
   1986 
   1987  // NOTE: |token->modifier| is set in |newToken()| so that optimized,
   1988  // non-debug code won't do any work to pass a modifier-argument that will
   1989  // never be used.
   1990 
   1991  return token;
   1992 }
   1993 
   1994 template <typename Unit, class AnyCharsAccess>
   1995 MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
   1996  // We didn't get a token, so don't set |flags.isDirtyLine|.
   1997  anyCharsAccess().flags.hadError = true;
   1998 
   1999  // Poisoning sourceUnits on error establishes an invariant: once an
   2000  // erroneous token has been seen, sourceUnits will not be consulted again.
   2001  // This is true because the parser will deal with the illegal token by
   2002  // aborting parsing immediately.
   2003  this->sourceUnits.poisonInDebug();
   2004 
   2005  return false;
   2006 };
   2007 
   2008 bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, char32_t codePoint) {
   2009  MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
   2010             "should only be processing code points validly decoded from UTF-8 "
   2011             "or WTF-16 source text (surrogate code points permitted)");
   2012 
   2013  char16_t units[2];
   2014  unsigned numUnits = 0;
   2015  unicode::UTF16Encode(codePoint, units, &numUnits);
   2016 
   2017  MOZ_ASSERT(numUnits == 1 || numUnits == 2,
   2018             "UTF-16 code points are only encoded in one or two units");
   2019 
   2020  if (!charBuffer.append(units[0])) {
   2021    return false;
   2022  }
   2023 
   2024  if (numUnits == 1) {
   2025    return true;
   2026  }
   2027 
   2028  return charBuffer.append(units[1]);
   2029 }
   2030 
   2031 template <typename Unit, class AnyCharsAccess>
   2032 bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
   2033    const Unit* identStart) {
   2034  const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit();
   2035  this->sourceUnits.setAddressOfNextCodeUnit(identStart);
   2036 
   2037  auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() {
   2038    this->sourceUnits.setAddressOfNextCodeUnit(originalAddress);
   2039  });
   2040 
   2041  this->charBuffer.clear();
   2042  do {
   2043    int32_t unit = getCodeUnit();
   2044    if (unit == EOF) {
   2045      break;
   2046    }
   2047 
   2048    char32_t codePoint;
   2049    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   2050      if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') {
   2051        if (!this->charBuffer.append(unit)) {
   2052          return false;
   2053        }
   2054 
   2055        continue;
   2056      }
   2057 
   2058      if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
   2059        break;
   2060      }
   2061    } else {
   2062      // |restoreNextRawCharAddress| undoes all gets, and this function
   2063      // doesn't update line/column info.
   2064      char32_t cp;
   2065      if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
   2066        return false;
   2067      }
   2068 
   2069      codePoint = cp;
   2070      if (!unicode::IsIdentifierPart(codePoint)) {
   2071        break;
   2072      }
   2073    }
   2074 
   2075    if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
   2076      return false;
   2077    }
   2078  } while (true);
   2079 
   2080  return true;
   2081 }
   2082 
   2083 template <typename Unit, class AnyCharsAccess>
   2084 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName(
   2085    TokenStart start, const Unit* identStart, IdentifierEscapes escaping,
   2086    Modifier modifier, NameVisibility visibility, TokenKind* out) {
   2087  // Run the bad-token code for every path out of this function except the
   2088  // two success-cases.
   2089  auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
   2090 
   2091  // We've already consumed an initial code point in the identifer, to *know*
   2092  // that this is an identifier.  So no need to worry about not consuming any
   2093  // code points in the loop below.
   2094  int32_t unit;
   2095  while (true) {
   2096    unit = peekCodeUnit();
   2097    if (unit == EOF) {
   2098      break;
   2099    }
   2100 
   2101    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   2102      consumeKnownCodeUnit(unit);
   2103 
   2104      if (MOZ_UNLIKELY(
   2105              !unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) {
   2106        // Handle a Unicode escape -- otherwise it's not part of the
   2107        // identifier.
   2108        char32_t codePoint;
   2109        if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
   2110          ungetCodeUnit(unit);
   2111          break;
   2112        }
   2113 
   2114        escaping = IdentifierEscapes::SawUnicodeEscape;
   2115      }
   2116    } else {
   2117      // This ignores encoding errors: subsequent caller-side code to
   2118      // handle source text after the IdentifierName will do so.
   2119      PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
   2120      if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) {
   2121        break;
   2122      }
   2123 
   2124      MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
   2125                 "IdentifierPart must guarantee !IsLineTerminator or "
   2126                 "else we'll fail to maintain line-info/flags for EOL");
   2127 
   2128      this->sourceUnits.consumeKnownCodePoint(peeked);
   2129    }
   2130  }
   2131 
   2132  TaggedParserAtomIndex atom;
   2133  if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) {
   2134    // Identifiers containing Unicode escapes have to be converted into
   2135    // tokenbuf before atomizing.
   2136    if (!putIdentInCharBuffer(identStart)) {
   2137      return false;
   2138    }
   2139 
   2140    atom = drainCharBufferIntoAtom();
   2141  } else {
   2142    // Escape-free identifiers can be created directly from sourceUnits.
   2143    const Unit* chars = identStart;
   2144    size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart;
   2145 
   2146    // Private identifiers start with a '#', and so cannot be reserved words.
   2147    if (visibility == NameVisibility::Public) {
   2148      // Represent reserved words lacking escapes as reserved word tokens.
   2149      if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) {
   2150        noteBadToken.release();
   2151        newSimpleToken(rw->tokentype, start, modifier, out);
   2152        return true;
   2153      }
   2154    }
   2155 
   2156    atom = atomizeSourceChars(Span(chars, length));
   2157  }
   2158  if (!atom) {
   2159    return false;
   2160  }
   2161 
   2162  noteBadToken.release();
   2163  if (visibility == NameVisibility::Private) {
   2164    newPrivateNameToken(atom, start, modifier, out);
   2165    return true;
   2166  }
   2167  newNameToken(atom, start, modifier, out);
   2168  return true;
   2169 }
   2170 
// Classification of a code unit by the kind of token it can begin.  These
// are the values stored in the |firstCharKinds| table.
enum FirstCharKind {
  // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid
  // token that cannot also be a prefix of a longer token.  E.g. ';' has the
  // OneChar kind, but '+' does not, because '++' and '+=' are valid longer
  // tokens that begin with '+'.
  //
  // The few token kinds satisfying these properties cover roughly 35--45%
  // of the tokens seen in practice.
  //
  // We represent the 'OneChar' kind with any value in
  // [OneChar_Min, OneChar_Max], i.e. any value less than TokenKind::Limit.
  // This representation lets us associate each one-char token char16_t with
  // a TokenKind and thus avoid a subsequent char16_t-to-TokenKind
  // conversion.
  OneChar_Min = 0,
  OneChar_Max = size_t(TokenKind::Limit) - 1,

  // The remaining kinds are numbered from TokenKind::Limit upward, so they
  // never collide with the OneChar range above.
  Space = size_t(TokenKind::Limit),
  Ident,
  Dec,
  String,
  EOL,
  ZeroDigit,
  Other,

  // Largest kind value; used to check that table elements are wide enough.
  LastCharKind = Other
};
   2198 
// Table of FirstCharKind values for code units 0 through 127, laid out ten
// per row.  Legend (code unit values, then the characters they denote):
//
// OneChar: 40,  41,  44,  58,  59,  91,  93,  123, 125, 126:
//          '(', ')', ',', ':', ';', '[', ']', '{', '}', '~'
// Ident:   36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
// String:  34, 39, 96: '"', '\'', '`'
// Dec:     49..57: '1'..'9'
// ZeroDigit:  48: '0'
// Space:   9, 11, 12, 32: '\t', '\v', '\f', ' '
// EOL:     10, 13: '\n', '\r'
// Other:   everything else, including 46 '.', 61 '=' and 43 '+'
//
// The T_* macros below are short aliases for the one-char TokenKind values,
// and _______ stands for Other; all are #undef'd right after the table.
#define T_COMMA size_t(TokenKind::Comma)
#define T_COLON size_t(TokenKind::Colon)
#define T_BITNOT size_t(TokenKind::BitNot)
#define T_LP size_t(TokenKind::LeftParen)
#define T_RP size_t(TokenKind::RightParen)
#define T_SEMI size_t(TokenKind::Semi)
#define T_LB size_t(TokenKind::LeftBracket)
#define T_RB size_t(TokenKind::RightBracket)
#define T_LC size_t(TokenKind::LeftCurly)
#define T_RC size_t(TokenKind::RightCurly)
#define _______ Other
static const uint8_t firstCharKinds[] = {
    // clang-format off
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______,   Space,
/*  10+ */     EOL,   Space,   Space,     EOL, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______,   Space, _______,  String, _______,   Ident, _______, _______,  String,
/*  40+ */    T_LP,    T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit,    Dec,
/*  50+ */     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec, T_COLON,  T_SEMI,
/*  60+ */ _______, _______, _______, _______, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  70+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  80+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  90+ */   Ident,    T_LB, _______,    T_RB, _______,   Ident,  String,   Ident,   Ident,   Ident,
/* 100+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 110+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 120+ */   Ident,   Ident,   Ident,    T_LC, _______,    T_RC,T_BITNOT, _______
    // clang-format on
};
#undef T_COMMA
#undef T_COLON
#undef T_BITNOT
#undef T_LP
#undef T_RP
#undef T_SEMI
#undef T_LB
#undef T_RB
#undef T_LC
#undef T_RC
#undef _______

// Every FirstCharKind value must be representable in one table element.
static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
              "Elements of firstCharKinds[] are too small");
   2254 
   2255 template <>
   2256 void SourceUnits<char16_t>::consumeRestOfSingleLineComment() {
   2257  while (MOZ_LIKELY(!atEnd())) {
   2258    char16_t unit = peekCodeUnit();
   2259    if (IsLineTerminator(unit)) {
   2260      return;
   2261    }
   2262 
   2263    consumeKnownCodeUnit(unit);
   2264  }
   2265 }
   2266 
   2267 template <>
   2268 void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() {
   2269  while (MOZ_LIKELY(!atEnd())) {
   2270    const Utf8Unit unit = peekCodeUnit();
   2271    if (IsSingleUnitLineTerminator(unit)) {
   2272      return;
   2273    }
   2274 
   2275    if (MOZ_LIKELY(IsAscii(unit))) {
   2276      consumeKnownCodeUnit(unit);
   2277      continue;
   2278    }
   2279 
   2280    PeekedCodePoint<Utf8Unit> peeked = peekCodePoint();
   2281    if (peeked.isNone()) {
   2282      return;
   2283    }
   2284 
   2285    char32_t c = peeked.codePoint();
   2286    if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
   2287                     c == unicode::PARA_SEPARATOR)) {
   2288      return;
   2289    }
   2290 
   2291    consumeKnownCodePoint(peeked);
   2292  }
   2293 }
   2294 
   2295 template <typename Unit, class AnyCharsAccess>
   2296 [[nodiscard]] MOZ_ALWAYS_INLINE bool
   2297 TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger(
   2298    IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
   2299  int32_t unit = getCodeUnit();
   2300  if (!isIntegerUnit(unit)) {
   2301    *nextUnit = unit;
   2302    return true;
   2303  }
   2304  return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit);
   2305 }
   2306 
   2307 template <typename Unit, class AnyCharsAccess>
   2308 [[nodiscard]] MOZ_ALWAYS_INLINE bool
   2309 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit(
   2310    IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
   2311  int32_t unit;
   2312  while (true) {
   2313    unit = getCodeUnit();
   2314    if (isIntegerUnit(unit)) {
   2315      continue;
   2316    }
   2317    if (unit != '_') {
   2318      break;
   2319    }
   2320    unit = getCodeUnit();
   2321    if (!isIntegerUnit(unit)) {
   2322      if (unit == '_') {
   2323        ungetCodeUnit(unit);
   2324        error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES);
   2325      } else {
   2326        ungetCodeUnit(unit);
   2327        ungetCodeUnit('_');
   2328        error(JSMSG_NUMBER_END_WITH_UNDERSCORE);
   2329      }
   2330      return false;
   2331    }
   2332  }
   2333 
   2334  *nextUnit = unit;
   2335  return true;
   2336 }
   2337 
   2338 template <typename Unit, class AnyCharsAccess>
   2339 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber(
   2340    int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier,
   2341    TokenKind* out) {
   2342  // Run the bad-token code for every path out of this function except the
   2343  // one success-case.
   2344  auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
   2345 
   2346  // Consume integral component digits.
   2347  if (IsAsciiDigit(unit)) {
   2348    if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
   2349      return false;
   2350    }
   2351  }
   2352 
   2353  // Numbers contain no escapes, so we can read directly from |sourceUnits|.
   2354  double dval;
   2355  bool isBigInt = false;
   2356  DecimalPoint decimalPoint = NoDecimal;
   2357  if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') {
   2358    // NOTE: |unit| may be EOF here.
   2359    ungetCodeUnit(unit);
   2360 
   2361    // Most numbers are pure decimal integers without fractional component
   2362    // or exponential notation.  Handle that with optimized code.
   2363    if (!GetDecimalInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(),
   2364                           &dval)) {
   2365      ReportOutOfMemory(this->fc);
   2366      return false;
   2367    }
   2368  } else if (unit == 'n') {
   2369    isBigInt = true;
   2370    unit = peekCodeUnit();
   2371  } else {
   2372    // Consume any decimal dot and fractional component.
   2373    if (unit == '.') {
   2374      decimalPoint = HasDecimal;
   2375      if (!matchInteger(IsAsciiDigit, &unit)) {
   2376        return false;
   2377      }
   2378    }
   2379 
   2380    // Consume any exponential notation.
   2381    if (unit == 'e' || unit == 'E') {
   2382      unit = getCodeUnit();
   2383      if (unit == '+' || unit == '-') {
   2384        unit = getCodeUnit();
   2385      }
   2386 
   2387      // Exponential notation must contain at least one digit.
   2388      if (!IsAsciiDigit(unit)) {
   2389        ungetCodeUnit(unit);
   2390        error(JSMSG_MISSING_EXPONENT);
   2391        return false;
   2392      }
   2393 
   2394      // Consume exponential digits.
   2395      if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
   2396        return false;
   2397      }
   2398    }
   2399 
   2400    ungetCodeUnit(unit);
   2401 
   2402    if (!GetDecimal(numStart, this->sourceUnits.addressOfNextCodeUnit(),
   2403                    &dval)) {
   2404      ReportOutOfMemory(this->fc);
   2405      return false;
   2406    }
   2407  }
   2408 
   2409  // Number followed by IdentifierStart is an error.  (This is the only place
   2410  // in ECMAScript where token boundary is inadequate to properly separate
   2411  // two tokens, necessitating this unaesthetic lookahead.)
   2412  if (unit != EOF) {
   2413    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   2414      if (unicode::IsIdentifierStart(char16_t(unit))) {
   2415        error(JSMSG_IDSTART_AFTER_NUMBER);
   2416        return false;
   2417      }
   2418    } else {
   2419      // This ignores encoding errors: subsequent caller-side code to
   2420      // handle source text after the number will do so.
   2421      PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
   2422      if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
   2423        error(JSMSG_IDSTART_AFTER_NUMBER);
   2424        return false;
   2425      }
   2426    }
   2427  }
   2428 
   2429  noteBadToken.release();
   2430 
   2431  if (isBigInt) {
   2432    return bigIntLiteral(start, modifier, out);
   2433  }
   2434 
   2435  newNumberToken(dval, decimalPoint, start, modifier, out);
   2436  return true;
   2437 }
   2438 
   2439 template <typename Unit, class AnyCharsAccess>
   2440 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
   2441    TokenStart start, TokenKind* out) {
   2442  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/'));
   2443  this->charBuffer.clear();
   2444 
   2445  auto ProcessNonAsciiCodePoint = [this](int32_t lead) {
   2446    MOZ_ASSERT(lead != EOF);
   2447    MOZ_ASSERT(!this->isAsciiCodePoint(lead));
   2448 
   2449    char32_t codePoint;
   2450    if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead),
   2451                                                 &codePoint)) {
   2452      return false;
   2453    }
   2454 
   2455    if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
   2456                     codePoint == unicode::PARA_SEPARATOR)) {
   2457      this->sourceUnits.ungetLineOrParagraphSeparator();
   2458      this->error(JSMSG_UNTERMINATED_REGEXP);
   2459      return false;
   2460    }
   2461 
   2462    return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
   2463  };
   2464 
   2465  auto ReportUnterminatedRegExp = [this](int32_t unit) {
   2466    this->ungetCodeUnit(unit);
   2467    this->error(JSMSG_UNTERMINATED_REGEXP);
   2468  };
   2469 
   2470  bool inCharClass = false;
   2471  do {
   2472    int32_t unit = getCodeUnit();
   2473    if (unit == EOF) {
   2474      ReportUnterminatedRegExp(unit);
   2475      return badToken();
   2476    }
   2477 
   2478    if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
   2479      if (!ProcessNonAsciiCodePoint(unit)) {
   2480        return badToken();
   2481      }
   2482 
   2483      continue;
   2484    }
   2485 
   2486    if (unit == '\\') {
   2487      if (!this->charBuffer.append(unit)) {
   2488        return badToken();
   2489      }
   2490 
   2491      unit = getCodeUnit();
   2492      if (unit == EOF) {
   2493        ReportUnterminatedRegExp(unit);
   2494        return badToken();
   2495      }
   2496 
   2497      // Fallthrough only handles ASCII code points, so
   2498      // deal with non-ASCII and skip everything else.
   2499      if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
   2500        if (!ProcessNonAsciiCodePoint(unit)) {
   2501          return badToken();
   2502        }
   2503 
   2504        continue;
   2505      }
   2506    } else if (unit == '[') {
   2507      inCharClass = true;
   2508    } else if (unit == ']') {
   2509      inCharClass = false;
   2510    } else if (unit == '/' && !inCharClass) {
   2511      // For IE compat, allow unescaped / in char classes.
   2512      break;
   2513    }
   2514 
   2515    // NOTE: Non-ASCII LineTerminators were handled by
   2516    //       ProcessNonAsciiCodePoint calls above.
   2517    if (unit == '\r' || unit == '\n') {
   2518      ReportUnterminatedRegExp(unit);
   2519      return badToken();
   2520    }
   2521 
   2522    MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit)));
   2523    if (!this->charBuffer.append(unit)) {
   2524      return badToken();
   2525    }
   2526  } while (true);
   2527 
   2528  int32_t unit;
   2529  RegExpFlags reflags = RegExpFlag::NoFlags;
   2530  while (true) {
   2531    uint8_t flag;
   2532    unit = getCodeUnit();
   2533    if (unit == 'd') {
   2534      flag = RegExpFlag::HasIndices;
   2535    } else if (unit == 'g') {
   2536      flag = RegExpFlag::Global;
   2537    } else if (unit == 'i') {
   2538      flag = RegExpFlag::IgnoreCase;
   2539    } else if (unit == 'm') {
   2540      flag = RegExpFlag::Multiline;
   2541    } else if (unit == 's') {
   2542      flag = RegExpFlag::DotAll;
   2543    } else if (unit == 'u') {
   2544      flag = RegExpFlag::Unicode;
   2545    } else if (unit == 'v') {
   2546      flag = RegExpFlag::UnicodeSets;
   2547    } else if (unit == 'y') {
   2548      flag = RegExpFlag::Sticky;
   2549    } else if (IsAsciiAlpha(unit)) {
   2550      flag = RegExpFlag::NoFlags;
   2551    } else {
   2552      break;
   2553    }
   2554 
   2555    if ((reflags & flag) || flag == RegExpFlag::NoFlags) {
   2556      ungetCodeUnit(unit);
   2557      char buf[2] = {char(unit), '\0'};
   2558      error(JSMSG_BAD_REGEXP_FLAG, buf);
   2559      return badToken();
   2560    }
   2561 
   2562    // /u and /v flags are mutually exclusive.
   2563    if (((reflags & RegExpFlag::Unicode) && (flag & RegExpFlag::UnicodeSets)) ||
   2564        ((reflags & RegExpFlag::UnicodeSets) && (flag & RegExpFlag::Unicode))) {
   2565      ungetCodeUnit(unit);
   2566      char buf[2] = {char(unit), '\0'};
   2567      error(JSMSG_BAD_REGEXP_FLAG, buf);
   2568      return badToken();
   2569    }
   2570 
   2571    reflags |= flag;
   2572  }
   2573  ungetCodeUnit(unit);
   2574 
   2575  newRegExpToken(reflags, start, out);
   2576  return true;
   2577 }
   2578 
   2579 template <typename Unit, class AnyCharsAccess>
   2580 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
   2581    TokenStart start, Modifier modifier, TokenKind* out) {
   2582  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n'));
   2583  MOZ_ASSERT(this->sourceUnits.offset() > start.offset());
   2584  uint32_t length = this->sourceUnits.offset() - start.offset();
   2585  MOZ_ASSERT(length >= 2);
   2586  this->charBuffer.clear();
   2587  mozilla::Range<const Unit> chars(
   2588      this->sourceUnits.codeUnitPtrAt(start.offset()), length);
   2589  for (uint32_t idx = 0; idx < length - 1; idx++) {
   2590    int32_t unit = CodeUnitValue(chars[idx]);
   2591    // Char buffer may start with a 0[bBoOxX] prefix, then follows with
   2592    // binary, octal, decimal, or hex digits.  Already checked by caller, as
   2593    // the "n" indicating bigint comes at the end.
   2594    MOZ_ASSERT(isAsciiCodePoint(unit));
   2595    // Skip over any separators.
   2596    if (unit == '_') {
   2597      continue;
   2598    }
   2599    if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
   2600      return false;
   2601    }
   2602  }
   2603  newBigIntToken(start, modifier, out);
   2604  return true;
   2605 }
   2606 
   2607 template <typename Unit, class AnyCharsAccess>
   2608 void GeneralTokenStreamChars<Unit,
   2609                             AnyCharsAccess>::consumeOptionalHashbangComment() {
   2610  MOZ_ASSERT(this->sourceUnits.atStart(),
   2611             "HashBangComment can only appear immediately at the start of a "
   2612             "Script or Module");
   2613 
   2614  // HashbangComment ::
   2615  //   #!  SingleLineCommentChars_opt
   2616 
   2617  if (!matchCodeUnit('#')) {
   2618    // HashbangComment is optional at start of Script or Module.
   2619    return;
   2620  }
   2621 
   2622  if (!matchCodeUnit('!')) {
   2623    // # not followed by ! at start of Script or Module is an error, but normal
   2624    // parsing code will handle that error just fine if we let it.
   2625    ungetCodeUnit('#');
   2626    return;
   2627  }
   2628 
   2629  // This doesn't consume a concluding LineTerminator, and it stops consuming
   2630  // just before any encoding error.  The subsequent |getToken| call will call
   2631  // |getTokenInternal| below which will handle these possibilities.
   2632  this->sourceUnits.consumeRestOfSingleLineComment();
   2633 }
   2634 
   2635 template <typename Unit, class AnyCharsAccess>
   2636 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal(
   2637    TokenKind* const ttp, const Modifier modifier) {
   2638  // Assume we'll fail: success cases will overwrite this.
   2639 #ifdef DEBUG
   2640  *ttp = TokenKind::Limit;
   2641 #endif
   2642  MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
   2643 
   2644  // This loop runs more than once only when whitespace or comments are
   2645  // encountered.
   2646  do {
   2647    int32_t unit = peekCodeUnit();
   2648    if (MOZ_UNLIKELY(unit == EOF)) {
   2649      MOZ_ASSERT(this->sourceUnits.atEnd());
   2650      anyCharsAccess().flags.isEOF = true;
   2651      TokenStart start(this->sourceUnits, 0);
   2652      newSimpleToken(TokenKind::Eof, start, modifier, ttp);
   2653      return true;
   2654    }
   2655 
   2656    if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
   2657      // Non-ASCII code points can only be identifiers or whitespace.  It would
   2658      // be nice to compute these *after* discarding whitespace, but IN A WORLD
   2659      // where |unicode::IsSpace| requires consuming a variable number of code
   2660      // units, it's easier to assume it's an identifier and maybe do a little
   2661      // wasted work, than to unget and compute and reget if whitespace.
   2662      TokenStart start(this->sourceUnits, 0);
   2663      const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit();
   2664 
   2665      PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
   2666      if (peeked.isNone()) {
   2667        MOZ_ALWAYS_FALSE(getCodePoint());
   2668        return badToken();
   2669      }
   2670 
   2671      char32_t cp = peeked.codePoint();
   2672      if (unicode::IsSpace(cp)) {
   2673        this->sourceUnits.consumeKnownCodePoint(peeked);
   2674        if (IsLineTerminator(cp)) {
   2675          if (!updateLineInfoForEOL()) {
   2676            return badToken();
   2677          }
   2678 
   2679          anyCharsAccess().updateFlagsForEOL();
   2680        }
   2681 
   2682        continue;
   2683      }
   2684 
   2685      static_assert(isAsciiCodePoint('$'),
   2686                    "IdentifierStart contains '$', but as "
   2687                    "!IsUnicodeIDStart('$'), ensure that '$' is never "
   2688                    "handled here");
   2689      static_assert(isAsciiCodePoint('_'),
   2690                    "IdentifierStart contains '_', but as "
   2691                    "!IsUnicodeIDStart('_'), ensure that '_' is never "
   2692                    "handled here");
   2693 
   2694      if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
   2695        this->sourceUnits.consumeKnownCodePoint(peeked);
   2696        MOZ_ASSERT(!IsLineTerminator(cp),
   2697                   "IdentifierStart must guarantee !IsLineTerminator "
   2698                   "or else we'll fail to maintain line-info/flags "
   2699                   "for EOL here");
   2700 
   2701        return identifierName(start, identStart, IdentifierEscapes::None,
   2702                              modifier, NameVisibility::Public, ttp);
   2703      }
   2704 
   2705      reportIllegalCharacter(cp);
   2706      return badToken();
   2707    }  // !isAsciiCodePoint(unit)
   2708 
   2709    consumeKnownCodeUnit(unit);
   2710 
   2711    // Get the token kind, based on the first char.  The ordering of c1kind
   2712    // comparison is based on the frequency of tokens in real code:
   2713    // Parsemark (which represents typical JS code on the web) and the
   2714    // Unreal demo (which represents asm.js code).
   2715    //
   2716    //                  Parsemark   Unreal
   2717    //  OneChar         32.9%       39.7%
   2718    //  Space           25.0%        0.6%
   2719    //  Ident           19.2%       36.4%
   2720    //  Dec              7.2%        5.1%
   2721    //  String           7.9%        0.0%
   2722    //  EOL              1.7%        0.0%
   2723    //  ZeroDigit        0.4%        4.9%
   2724    //  Other            5.7%       13.3%
   2725    //
   2726    // The ordering is based mostly only Parsemark frequencies, with Unreal
   2727    // frequencies used to break close categories (e.g. |Dec| and
   2728    // |String|).  |Other| is biggish, but no other token kind is common
   2729    // enough for it to be worth adding extra values to FirstCharKind.
   2730    FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]);
   2731 
   2732    // Look for an unambiguous single-char token.
   2733    //
   2734    if (c1kind <= OneChar_Max) {
   2735      TokenStart start(this->sourceUnits, -1);
   2736      newSimpleToken(TokenKind(c1kind), start, modifier, ttp);
   2737      return true;
   2738    }
   2739 
   2740    // Skip over non-EOL whitespace chars.
   2741    //
   2742    if (c1kind == Space) {
   2743      continue;
   2744    }
   2745 
   2746    // Look for an identifier.
   2747    //
   2748    if (c1kind == Ident) {
   2749      TokenStart start(this->sourceUnits, -1);
   2750      return identifierName(
   2751          start, this->sourceUnits.addressOfNextCodeUnit() - 1,
   2752          IdentifierEscapes::None, modifier, NameVisibility::Public, ttp);
   2753    }
   2754 
   2755    // Look for a decimal number.
   2756    //
   2757    if (c1kind == Dec) {
   2758      TokenStart start(this->sourceUnits, -1);
   2759      const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
   2760      return decimalNumber(unit, start, numStart, modifier, ttp);
   2761    }
   2762 
   2763    // Look for a string or a template string.
   2764    //
   2765    if (c1kind == String) {
   2766      return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp);
   2767    }
   2768 
   2769    // Skip over EOL chars, updating line state along the way.
   2770    //
   2771    if (c1kind == EOL) {
   2772      if (unit == '\r') {
   2773        matchLineTerminator('\n');
   2774      }
   2775 
   2776      if (!updateLineInfoForEOL()) {
   2777        return badToken();
   2778      }
   2779 
   2780      anyCharsAccess().updateFlagsForEOL();
   2781      continue;
   2782    }
   2783 
   2784    // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a
   2785    // number starting with '0' that contains '8' or '9' and is treated as
   2786    // decimal) number.
   2787    //
   2788    if (c1kind == ZeroDigit) {
   2789      TokenStart start(this->sourceUnits, -1);
   2790      int radix;
   2791      bool isBigInt = false;
   2792      const Unit* numStart;
   2793      unit = getCodeUnit();
   2794      if (unit == 'x' || unit == 'X') {
   2795        radix = 16;
   2796        unit = getCodeUnit();
   2797        if (!IsAsciiHexDigit(unit)) {
   2798          // NOTE: |unit| may be EOF here.
   2799          ungetCodeUnit(unit);
   2800          error(JSMSG_MISSING_HEXDIGITS);
   2801          return badToken();
   2802        }
   2803 
   2804        // one past the '0x'
   2805        numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
   2806 
   2807        if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) {
   2808          return badToken();
   2809        }
   2810      } else if (unit == 'b' || unit == 'B') {
   2811        radix = 2;
   2812        unit = getCodeUnit();
   2813        if (!IsAsciiBinary(unit)) {
   2814          // NOTE: |unit| may be EOF here.
   2815          ungetCodeUnit(unit);
   2816          error(JSMSG_MISSING_BINARY_DIGITS);
   2817          return badToken();
   2818        }
   2819 
   2820        // one past the '0b'
   2821        numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
   2822 
   2823        if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) {
   2824          return badToken();
   2825        }
   2826      } else if (unit == 'o' || unit == 'O') {
   2827        radix = 8;
   2828        unit = getCodeUnit();
   2829        if (!IsAsciiOctal(unit)) {
   2830          // NOTE: |unit| may be EOF here.
   2831          ungetCodeUnit(unit);
   2832          error(JSMSG_MISSING_OCTAL_DIGITS);
   2833          return badToken();
   2834        }
   2835 
   2836        // one past the '0o'
   2837        numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
   2838 
   2839        if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) {
   2840          return badToken();
   2841        }
   2842      } else if (IsAsciiDigit(unit)) {
   2843        // Reject octal literals that appear in strict mode code.
   2844        if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) {
   2845          return badToken();
   2846        }
   2847 
   2848        // The above test doesn't catch a few edge cases; see
   2849        // |GeneralParser::maybeParseDirective|.  Record the violation so that
   2850        // that function can handle them.
   2851        anyCharsAccess().setSawDeprecatedOctalLiteral();
   2852 
   2853        radix = 8;
   2854        // one past the '0'
   2855        numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
   2856 
   2857        bool nonOctalDecimalIntegerLiteral = false;
   2858        do {
   2859          if (unit >= '8') {
   2860            nonOctalDecimalIntegerLiteral = true;
   2861          }
   2862          unit = getCodeUnit();
   2863        } while (IsAsciiDigit(unit));
   2864 
   2865        if (unit == '_') {
   2866          ungetCodeUnit(unit);
   2867          error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
   2868          return badToken();
   2869        }
   2870 
   2871        if (unit == 'n') {
   2872          ungetCodeUnit(unit);
   2873          error(JSMSG_BIGINT_INVALID_SYNTAX);
   2874          return badToken();
   2875        }
   2876 
   2877        if (nonOctalDecimalIntegerLiteral) {
   2878          // Use the decimal scanner for the rest of the number.
   2879          return decimalNumber(unit, start, numStart, modifier, ttp);
   2880        }
   2881      } else if (unit == '_') {
   2882        // Give a more explicit error message when '_' is used after '0'.
   2883        ungetCodeUnit(unit);
   2884        error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
   2885        return badToken();
   2886      } else {
   2887        // '0' not followed by [XxBbOo0-9_];  scan as a decimal number.
   2888        ungetCodeUnit(unit);
   2889        numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;  // The '0'.
   2890        return decimalNumber('0', start, numStart, modifier, ttp);
   2891      }
   2892 
   2893      if (unit == 'n') {
   2894        isBigInt = true;
   2895        unit = peekCodeUnit();
   2896      } else {
   2897        ungetCodeUnit(unit);
   2898      }
   2899 
   2900      // Error if an identifier-start code point appears immediately
   2901      // after the number.  Somewhat surprisingly, if we don't check
   2902      // here, we'll never check at all.
   2903      if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   2904        if (unicode::IsIdentifierStart(char16_t(unit))) {
   2905          error(JSMSG_IDSTART_AFTER_NUMBER);
   2906          return badToken();
   2907        }
   2908      } else if (MOZ_LIKELY(unit != EOF)) {
   2909        // This ignores encoding errors: subsequent caller-side code to
   2910        // handle source text after the number will do so.
   2911        PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
   2912        if (!peeked.isNone() &&
   2913            unicode::IsIdentifierStart(peeked.codePoint())) {
   2914          error(JSMSG_IDSTART_AFTER_NUMBER);
   2915          return badToken();
   2916        }
   2917      }
   2918 
   2919      if (isBigInt) {
   2920        return bigIntLiteral(start, modifier, ttp);
   2921      }
   2922 
   2923      double dval;
   2924      if (!GetFullInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(),
   2925                          radix, IntegerSeparatorHandling::SkipUnderscore,
   2926                          &dval)) {
   2927        ReportOutOfMemory(this->fc);
   2928        return badToken();
   2929      }
   2930      newNumberToken(dval, NoDecimal, start, modifier, ttp);
   2931      return true;
   2932    }
   2933 
   2934    MOZ_ASSERT(c1kind == Other);
   2935 
   2936    // This handles everything else.  Simple tokens distinguished solely by
   2937    // TokenKind should set |simpleKind| and break, to share simple-token
   2938    // creation code for all such tokens.  All other tokens must be handled
   2939    // by returning (or by continuing from the loop enclosing this).
   2940    //
   2941    TokenStart start(this->sourceUnits, -1);
   2942    TokenKind simpleKind;
   2943 #ifdef DEBUG
   2944    simpleKind = TokenKind::Limit;  // sentinel value for code after switch
   2945 #endif
   2946 
   2947    // The block a ways above eliminated all non-ASCII, so cast to the
   2948    // smallest type possible to assist the C++ compiler.
   2949    switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
   2950      case '.':
   2951        if (IsAsciiDigit(peekCodeUnit())) {
   2952          return decimalNumber('.', start,
   2953                               this->sourceUnits.addressOfNextCodeUnit() - 1,
   2954                               modifier, ttp);
   2955        }
   2956 
   2957        unit = getCodeUnit();
   2958        if (unit == '.') {
   2959          if (matchCodeUnit('.')) {
   2960            simpleKind = TokenKind::TripleDot;
   2961            break;
   2962          }
   2963        }
   2964 
   2965        // NOTE: |unit| may be EOF here.  A stray '.' at EOF would be an
   2966        //       error, but subsequent code will handle it.
   2967        ungetCodeUnit(unit);
   2968 
   2969        simpleKind = TokenKind::Dot;
   2970        break;
   2971 
   2972      case '#': {
   2973        TokenStart start(this->sourceUnits, -1);
   2974        const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
   2975        IdentifierEscapes sawEscape;
   2976        if (!matchIdentifierStart(&sawEscape)) {
   2977          return badToken();
   2978        }
   2979        return identifierName(start, identStart, sawEscape, modifier,
   2980                              NameVisibility::Private, ttp);
   2981      }
   2982 
   2983      case '=':
   2984        if (matchCodeUnit('=')) {
   2985          simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq;
   2986        } else if (matchCodeUnit('>')) {
   2987          simpleKind = TokenKind::Arrow;
   2988        } else {
   2989          simpleKind = TokenKind::Assign;
   2990        }
   2991        break;
   2992 
   2993      case '+':
   2994        if (matchCodeUnit('+')) {
   2995          simpleKind = TokenKind::Inc;
   2996        } else {
   2997          simpleKind =
   2998              matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add;
   2999        }
   3000        break;
   3001 
   3002      case '\\': {
   3003        char32_t codePoint;
   3004        if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) {
   3005          return identifierName(
   3006              start,
   3007              this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1,
   3008              IdentifierEscapes::SawUnicodeEscape, modifier,
   3009              NameVisibility::Public, ttp);
   3010        }
   3011 
   3012        // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
   3013        // could point at the 'H'.  But we don't do that now, so the code
   3014        // unit after the '\' isn't necessarily bad, so just point at the
   3015        // start of the actually-invalid escape.
   3016        ungetCodeUnit('\\');
   3017        error(JSMSG_BAD_ESCAPE);
   3018        return badToken();
   3019      }
   3020 
   3021      case '|':
   3022        if (matchCodeUnit('|')) {
   3023          simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or;
   3024        } else {
   3025          simpleKind =
   3026              matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr;
   3027        }
   3028        break;
   3029 
   3030      case '^':
   3031        simpleKind =
   3032            matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor;
   3033        break;
   3034 
   3035      case '&':
   3036        if (matchCodeUnit('&')) {
   3037          simpleKind =
   3038              matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And;
   3039        } else {
   3040          simpleKind =
   3041              matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd;
   3042        }
   3043        break;
   3044 
   3045      case '?':
   3046        if (matchCodeUnit('.')) {
   3047          unit = getCodeUnit();
   3048          if (IsAsciiDigit(unit)) {
   3049            // if the code unit is followed by a number, for example it has the
   3050            // following form `<...> ?.5 <..> then it should be treated as a
   3051            // ternary rather than as an optional chain
   3052            simpleKind = TokenKind::Hook;
   3053            ungetCodeUnit(unit);
   3054            ungetCodeUnit('.');
   3055          } else {
   3056            ungetCodeUnit(unit);
   3057            simpleKind = TokenKind::OptionalChain;
   3058          }
   3059        } else if (matchCodeUnit('?')) {
   3060          simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign
   3061                                          : TokenKind::Coalesce;
   3062        } else {
   3063          simpleKind = TokenKind::Hook;
   3064        }
   3065        break;
   3066 
   3067      case '!':
   3068        if (matchCodeUnit('=')) {
   3069          simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne;
   3070        } else {
   3071          simpleKind = TokenKind::Not;
   3072        }
   3073        break;
   3074 
   3075      case '<':
   3076        if (anyCharsAccess().options().allowHTMLComments) {
   3077          // Treat HTML begin-comment as comment-till-end-of-line.
   3078          if (matchCodeUnit('!')) {
   3079            if (matchCodeUnit('-')) {
   3080              if (matchCodeUnit('-')) {
   3081                this->sourceUnits.consumeRestOfSingleLineComment();
   3082                continue;
   3083              }
   3084              ungetCodeUnit('-');
   3085            }
   3086            ungetCodeUnit('!');
   3087          }
   3088        }
   3089        if (matchCodeUnit('<')) {
   3090          simpleKind =
   3091              matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh;
   3092        } else {
   3093          simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt;
   3094        }
   3095        break;
   3096 
   3097      case '>':
   3098        if (matchCodeUnit('>')) {
   3099          if (matchCodeUnit('>')) {
   3100            simpleKind =
   3101                matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh;
   3102          } else {
   3103            simpleKind =
   3104                matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh;
   3105          }
   3106        } else {
   3107          simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt;
   3108        }
   3109        break;
   3110 
   3111      case '*':
   3112        if (matchCodeUnit('*')) {
   3113          simpleKind =
   3114              matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow;
   3115        } else {
   3116          simpleKind =
   3117              matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul;
   3118        }
   3119        break;
   3120 
   3121      case '/':
   3122        // Look for a single-line comment.
   3123        if (matchCodeUnit('/')) {
   3124          unit = getCodeUnit();
   3125          if (unit == '@' || unit == '#') {
   3126            bool shouldWarn = unit == '@';
   3127            if (!getDirectives(false, shouldWarn)) {
   3128              return false;
   3129            }
   3130          } else {
   3131            // NOTE: |unit| may be EOF here.
   3132            ungetCodeUnit(unit);
   3133          }
   3134 
   3135          this->sourceUnits.consumeRestOfSingleLineComment();
   3136          continue;
   3137        }
   3138 
   3139        // Look for a multi-line comment.
   3140        if (matchCodeUnit('*')) {
   3141          TokenStreamAnyChars& anyChars = anyCharsAccess();
   3142          unsigned linenoBefore = anyChars.lineno;
   3143 
   3144          do {
   3145            int32_t unit = getCodeUnit();
   3146            if (unit == EOF) {
   3147              error(JSMSG_UNTERMINATED_COMMENT);
   3148              return badToken();
   3149            }
   3150 
   3151            if (unit == '*' && matchCodeUnit('/')) {
   3152              break;
   3153            }
   3154 
   3155            if (unit == '@' || unit == '#') {
   3156              bool shouldWarn = unit == '@';
   3157              if (!getDirectives(true, shouldWarn)) {
   3158                return badToken();
   3159              }
   3160            } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
   3161              if (!getFullAsciiCodePoint(unit)) {
   3162                return badToken();
   3163              }
   3164            } else {
   3165              char32_t codePoint;
   3166              if (!getNonAsciiCodePoint(unit, &codePoint)) {
   3167                return badToken();
   3168              }
   3169            }
   3170          } while (true);
   3171 
   3172          if (linenoBefore != anyChars.lineno) {
   3173            anyChars.updateFlagsForEOL();
   3174          }
   3175 
   3176          continue;
   3177        }
   3178 
   3179        // Look for a regexp.
   3180        if (modifier == SlashIsRegExp) {
   3181          return regexpLiteral(start, ttp);
   3182        }
   3183 
   3184        simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div;
   3185        break;
   3186 
   3187      case '%':
   3188        simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod;
   3189        break;
   3190 
   3191      case '-':
   3192        if (matchCodeUnit('-')) {
   3193          if (anyCharsAccess().options().allowHTMLComments &&
   3194              !anyCharsAccess().flags.isDirtyLine) {
   3195            if (matchCodeUnit('>')) {
   3196              this->sourceUnits.consumeRestOfSingleLineComment();
   3197              continue;
   3198            }
   3199          }
   3200 
   3201          simpleKind = TokenKind::Dec;
   3202        } else {
   3203          simpleKind =
   3204              matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
   3205        }
   3206        break;
   3207 
   3208 #ifdef ENABLE_DECORATORS
   3209      case '@':
   3210        simpleKind = TokenKind::At;
   3211        break;
   3212 #endif
   3213 
   3214      default:
   3215        // We consumed a bad ASCII code point/unit.  Put it back so the
   3216        // error location is the bad code point.
   3217        ungetCodeUnit(unit);
   3218        reportIllegalCharacter(unit);
   3219        return badToken();
   3220    }  // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
   3221 
   3222    MOZ_ASSERT(simpleKind != TokenKind::Limit,
   3223               "switch-statement should have set |simpleKind| before "
   3224               "breaking");
   3225 
   3226    newSimpleToken(simpleKind, start, modifier, ttp);
   3227    return true;
   3228  } while (true);
   3229 }
   3230 
   3231 template <typename Unit, class AnyCharsAccess>
   3232 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
   3233    char untilChar, Modifier modifier, TokenKind* out) {
         // Scans the contents of a string literal (|untilChar| is ' or ") or of a
         // template-literal chunk (|untilChar| is `), accumulating the literal's
         // value in |this->charBuffer|.  Scanning stops once the closing
         // |untilChar| has been consumed or, for templates only, once "${" has
         // been consumed.
         //
         // On success the buffer is drained into an atom, a String, TemplateHead,
         // or NoSubsTemplate token is created via newAtomToken() (|modifier| and
         // |out| are passed straight through), and true is returned.  Every
         // failure path returns false, and |noteBadToken| then runs badToken().
         //
         // When parsing a template, invalid escapes are not reported here: they
         // are recorded with setInvalidTemplateEscape() and scanning continues
         // without appending anything for that escape.
   3234  MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`',
   3235             "unexpected string/template literal delimiter");
   3236 
   3237  bool parsingTemplate = (untilChar == '`');
   3238  bool templateHead = false;
   3239 
         // NOTE(review): the -1 presumably makes the token start at the opening
         // delimiter, which the caller has already consumed -- confirm.
   3240  TokenStart start(this->sourceUnits, -1);
   3241  this->charBuffer.clear();
   3242 
   3243  // Run the bad-token code for every path out of this function except the
   3244  // one success-case.
   3245  auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
   3246 
   3247  auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) {
   3248    // Unicode separators aren't end-of-line in template or (as of
   3249    // recently) string literals, so this assertion doesn't allow them.
   3250    MOZ_ASSERT(this->sourceUnits.atEnd() ||
   3251                   this->sourceUnits.peekCodeUnit() == Unit('\r') ||
   3252                   this->sourceUnits.peekCodeUnit() == Unit('\n'),
   3253               "must be parked at EOF or EOL to call this function");
   3254 
   3255    // The various errors reported here include language like "in a ''
   3256    // literal" or similar, with '' being '', "", or `` as appropriate.
   3257    const char delimiters[] = {untilChar, untilChar, '\0'};
   3258 
   3259    this->error(errnum, delimiters);
   3260    return;
   3261  };
   3262 
   3263  // We need to detect any of these chars:  " or ', \n (or its
   3264  // equivalents), \\, EOF.  Because we detect EOL sequences here and
   3265  // put them back immediately, we can use getCodeUnit().
   3266  int32_t unit;
   3267  while ((unit = getCodeUnit()) != untilChar) {
   3268    if (unit == EOF) {
   3269      ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL);
   3270      return false;
   3271    }
   3272 
   3273    // Non-ASCII code points are always directly appended -- even
   3274    // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are
   3275    // ordinarily LineTerminatorSequences.  (They contribute their literal
   3276    // values to template and [as of recently] string literals, but they're
   3277    // line terminators when computing line/column coordinates.)  Handle
   3278    // the non-ASCII case early for readability.
   3279    if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
   3280      char32_t cp;
   3281      if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
   3282        return false;
   3283      }
   3284 
   3285      if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
   3286                       cp == unicode::PARA_SEPARATOR)) {
   3287        if (!updateLineInfoForEOL()) {
   3288          return false;
   3289        }
   3290 
   3291        anyCharsAccess().updateFlagsForEOL();
   3292      } else {
   3293        MOZ_ASSERT(!IsLineTerminator(cp));
   3294      }
   3295 
   3296      if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
   3297        return false;
   3298      }
   3299 
   3300      continue;
   3301    }
   3302 
   3303    if (unit == '\\') {
   3304      // When parsing templates, we don't immediately report errors for
   3305      // invalid escapes; these are handled by the parser.  We don't
   3306      // append to charBuffer in those cases because it won't be read.
   3307      unit = getCodeUnit();
   3308      if (unit == EOF) {
   3309        ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
   3310        return false;
   3311      }
   3312 
   3313      // Non-ASCII |unit| isn't handled by code after this, so dedicate
   3314      // an unlikely special-case to it and then continue.
   3315      if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
   3316        char32_t codePoint;
   3317        if (!getNonAsciiCodePoint(unit, &codePoint)) {
   3318          return false;
   3319        }
   3320 
   3321        // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH
   3322        // SEPARATOR, they'll be normalized to '\n'.  '\' followed by
   3323        // LineContinuation represents no code points, so don't append
   3324        // in this case.
   3325        if (codePoint != '\n') {
   3326          if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
   3327            return false;
   3328          }
   3329        }
   3330 
   3331        continue;
   3332      }
   3333 
             // Control flow in this switch: a case that |break|s leaves the decoded
             // value in |unit|, which is appended to |charBuffer| right after the
             // switch.  A case that |continue|s resumes the outer loop without
             // appending anything.
   3334      // The block above eliminated all non-ASCII, so cast to the
   3335      // smallest type possible to assist the C++ compiler.
   3336      switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
   3337        case 'b':
   3338          unit = '\b';
   3339          break;
   3340        case 'f':
   3341          unit = '\f';
   3342          break;
   3343        case 'n':
   3344          unit = '\n';
   3345          break;
   3346        case 'r':
   3347          unit = '\r';
   3348          break;
   3349        case 't':
   3350          unit = '\t';
   3351          break;
   3352        case 'v':
   3353          unit = '\v';
   3354          break;
   3355 
   3356        case '\r':
   3357          matchLineTerminator('\n');
   3358          [[fallthrough]];
   3359        case '\n': {
   3360          // LineContinuation represents no code points.  We're manually
   3361          // consuming a LineTerminatorSequence, so we must manually
   3362          // update line/column info.
   3363          if (!updateLineInfoForEOL()) {
   3364            return false;
   3365          }
   3366 
   3367          continue;
   3368        }
   3369 
   3370        // Unicode character specification.
   3371        case 'u': {
   3372          int32_t c2 = getCodeUnit();
   3373          if (c2 == EOF) {
   3374            ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
   3375            return false;
   3376          }
   3377 
   3378          // First handle a delimited Unicode escape, e.g. \u{1F4A9}.
   3379          if (c2 == '{') {
                   // Three units have been consumed for this escape so far (the
                   // backslash, 'u' and '{'), so this is the offset of the
                   // backslash.  Note that this local |start| shadows the
                   // function-level TokenStart |start| inside this block.
   3380            uint32_t start = this->sourceUnits.offset() - 3;
   3381            uint32_t code = 0;
   3382            bool first = true;
   3383            bool valid = true;
   3384            do {
   3385              int32_t u3 = getCodeUnit();
   3386              if (u3 == EOF) {
   3387                if (parsingTemplate) {
   3388                  TokenStreamAnyChars& anyChars = anyCharsAccess();
   3389                  anyChars.setInvalidTemplateEscape(start,
   3390                                                    InvalidEscapeType::Unicode);
   3391                  valid = false;
   3392                  break;
   3393                }
   3394                reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
   3395                return false;
   3396              }
   3397              if (u3 == '}') {
                       // |first| is still true only if no hex digit was seen, i.e.
                       // the escape was the empty "\u{}".
   3398                if (first) {
   3399                  if (parsingTemplate) {
   3400                    TokenStreamAnyChars& anyChars = anyCharsAccess();
   3401                    anyChars.setInvalidTemplateEscape(
   3402                        start, InvalidEscapeType::Unicode);
   3403                    valid = false;
   3404                    break;
   3405                  }
   3406                  reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
   3407                  return false;
   3408                }
   3409                break;
   3410              }
   3411 
   3412              // Beware: |u3| may be a non-ASCII code point here; if
   3413              // so it'll pass into this |if|-block.
   3414              if (!IsAsciiHexDigit(u3)) {
   3415                if (parsingTemplate) {
   3416                  // We put the code unit back so that we read it
   3417                  // on the next pass, which matters if it was
   3418                  // '`' or '\'.
   3419                  ungetCodeUnit(u3);
   3420 
   3421                  TokenStreamAnyChars& anyChars = anyCharsAccess();
   3422                  anyChars.setInvalidTemplateEscape(start,
   3423                                                    InvalidEscapeType::Unicode);
   3424                  valid = false;
   3425                  break;
   3426                }
   3427                reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
   3428                return false;
   3429              }
   3430 
   3431              code = (code << 4) | AsciiAlphanumericToNumber(u3);
                     // Checked after every digit, so |code| is rejected as soon as
                     // it exceeds NonBMPMax.  Overflow errors are located at
                     // |start + 3|, the offset just past "\u{" (the first digit).
   3432              if (code > unicode::NonBMPMax) {
   3433                if (parsingTemplate) {
   3434                  TokenStreamAnyChars& anyChars = anyCharsAccess();
   3435                  anyChars.setInvalidTemplateEscape(
   3436                      start + 3, InvalidEscapeType::UnicodeOverflow);
   3437                  valid = false;
   3438                  break;
   3439                }
   3440                reportInvalidEscapeError(start + 3,
   3441                                         InvalidEscapeType::UnicodeOverflow);
   3442                return false;
   3443              }
   3444 
   3445              first = false;
   3446            } while (true);
   3447 
                   // |valid| is only ever cleared on the template paths above, after
                   // the invalid escape has been recorded; nothing is appended.
   3448            if (!valid) {
   3449              continue;
   3450            }
   3451 
   3452            MOZ_ASSERT(code <= unicode::NonBMPMax);
   3453            if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
   3454              return false;
   3455            }
   3456 
   3457            continue;
   3458          }  // end of delimited Unicode escape handling
   3459 
   3460          // Otherwise it must be a fixed-length \uXXXX Unicode escape.
   3461          // If it isn't, this is usually an error -- but if this is a
   3462          // template literal, we must defer error reporting because
   3463          // malformed escapes are okay in *tagged* template literals.
                 // |c2| is the first hex digit and becomes the top four bits of the
                 // 16-bit result; |v| receives what matchHexDigits(3, ...) matched
                 // for the other three digits.
   3464          char16_t v;
   3465          if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) {
   3466            unit = (AsciiAlphanumericToNumber(c2) << 12) | v;
   3467          } else {
   3468            // Beware: |c2| may not be an ASCII code point here!
   3469            ungetCodeUnit(c2);
                   // With |c2| put back, only the backslash and 'u' remain consumed
                   // (presumably matchHexDigits consumes nothing when it fails --
                   // confirm), so -2 is the offset of the backslash.
   3470            uint32_t start = this->sourceUnits.offset() - 2;
   3471            if (parsingTemplate) {
   3472              TokenStreamAnyChars& anyChars = anyCharsAccess();
   3473              anyChars.setInvalidTemplateEscape(start,
   3474                                                InvalidEscapeType::Unicode);
   3475              continue;
   3476            }
   3477            reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
   3478            return false;
   3479          }
   3480          break;
   3481        }  // case 'u'
   3482 
   3483        // Hexadecimal character specification.
   3484        case 'x': {
   3485          char16_t v;
   3486          if (this->sourceUnits.matchHexDigits(2, &v)) {
   3487            unit = v;
   3488          } else {
                   // NOTE(review): -2 points at the backslash provided
                   // matchHexDigits consumed nothing on failure -- confirm.
   3489            uint32_t start = this->sourceUnits.offset() - 2;
   3490            if (parsingTemplate) {
   3491              TokenStreamAnyChars& anyChars = anyCharsAccess();
   3492              anyChars.setInvalidTemplateEscape(start,
   3493                                                InvalidEscapeType::Hexadecimal);
   3494              continue;
   3495            }
   3496            reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal);
   3497            return false;
   3498          }
   3499          break;
   3500        }
   3501 
   3502        default: {
                 // Any other escaped ASCII character.  Non-octal characters keep
                 // their own value in |unit| and are appended as-is after the
                 // switch (the \8 and \9 cases get extra handling first).
   3503          if (!IsAsciiOctal(unit)) {
   3504            // \8 or \9 in an untagged template literal is a syntax error,
   3505            // reported in GeneralParser::noSubstitutionUntaggedTemplate.
   3506            //
   3507            // Tagged template literals, however, may contain \8 and \9.  The
   3508            // "cooked" representation of such a part will be |undefined|, and
   3509            // the "raw" representation will contain the literal characters.
   3510            //
   3511            //   function f(parts) {
   3512            //     assertEq(parts[0], undefined);
   3513            //     assertEq(parts.raw[0], "\\8");
   3514            //     return "composed";
   3515            //   }
   3516            //   assertEq(f`\8`, "composed");
   3517            if (unit == '8' || unit == '9') {
   3518              TokenStreamAnyChars& anyChars = anyCharsAccess();
   3519              if (parsingTemplate) {
   3520                anyChars.setInvalidTemplateEscape(
   3521                    this->sourceUnits.offset() - 2,
   3522                    InvalidEscapeType::EightOrNine);
   3523                continue;
   3524              }
   3525 
   3526              // \8 and \9 are forbidden in string literals in strict mode code.
   3527              if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) {
   3528                return false;
   3529              }
   3530 
   3531              // The above test doesn't catch a few edge cases; see
   3532              // |GeneralParser::maybeParseDirective|.  Record the violation so
   3533              // that that function can handle them.
   3534              anyChars.setSawDeprecatedEightOrNineEscape();
   3535            }
   3536            break;
   3537          }
   3538 
   3539          // Octal character specification.
   3540          int32_t val = AsciiOctalToNumber(unit);
   3541 
                 // From here on |unit| holds the *peeked* (not yet consumed) unit
                 // following the octal digits read so far.
   3542          unit = peekCodeUnit();
   3543          if (MOZ_UNLIKELY(unit == EOF)) {
   3544            ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
   3545            return false;
   3546          }
   3547 
   3548          // Strict mode code allows only \0 followed by a non-digit.
   3549          if (val != 0 || IsAsciiDigit(unit)) {
   3550            TokenStreamAnyChars& anyChars = anyCharsAccess();
   3551            if (parsingTemplate) {
   3552              anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2,
   3553                                                InvalidEscapeType::Octal);
   3554              continue;
   3555            }
   3556 
   3557            if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) {
   3558              return false;
   3559            }
   3560 
   3561            // The above test doesn't catch a few edge cases; see
   3562            // |GeneralParser::maybeParseDirective|.  Record the violation so
   3563            // that that function can handle them.
   3564            anyChars.setSawDeprecatedOctalEscape();
   3565          }
   3566 
   3567          if (IsAsciiOctal(unit)) {
   3568            val = 8 * val + AsciiOctalToNumber(unit);
   3569            consumeKnownCodeUnit(unit);
   3570 
   3571            unit = peekCodeUnit();
   3572            if (MOZ_UNLIKELY(unit == EOF)) {
   3573              ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
   3574              return false;
   3575            }
   3576 
                   // A third octal digit is consumed only if the value stays within
                   // 0xFF; otherwise it is left in the input and |val| is restored.
   3577            if (IsAsciiOctal(unit)) {
   3578              int32_t save = val;
   3579              val = 8 * val + AsciiOctalToNumber(unit);
   3580              if (val <= 0xFF) {
   3581                consumeKnownCodeUnit(unit);
   3582              } else {
   3583                val = save;
   3584              }
   3585            }
   3586          }
   3587 
   3588          unit = char16_t(val);
   3589          break;
   3590        }  // default
   3591      }  // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
   3592 
             // Reached by every |break| out of the switch: append the escape's
             // decoded value.
   3593      if (!this->charBuffer.append(unit)) {
   3594        return false;
   3595      }
   3596 
   3597      continue;
   3598    }  // (unit == '\\')
   3599 
   3600    if (unit == '\r' || unit == '\n') {
   3601      if (!parsingTemplate) {
   3602        // String literals don't allow ASCII line breaks.
   3603        ungetCodeUnit(unit);
   3604        ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING);
   3605        return false;
   3606      }
   3607 
             // In a template, a CR (and an LF directly after it, if present) is
             // recorded as a single '\n' in the literal's contents.
   3608      if (unit == '\r') {
   3609        unit = '\n';
   3610        matchLineTerminator('\n');
   3611      }
   3612 
   3613      if (!updateLineInfoForEOL()) {
   3614        return false;
   3615      }
   3616 
   3617      anyCharsAccess().updateFlagsForEOL();
   3618    } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) {
             // "${" has been consumed: this chunk ends here and becomes a
             // TemplateHead rather than a NoSubsTemplate.
   3619      templateHead = true;
   3620      break;
   3621    }
   3622 
           // Ordinary content, including a template newline handled above and a
           // '$' that was not followed by '{'.
   3623    if (!this->charBuffer.append(unit)) {
   3624      return false;
   3625    }
   3626  }
   3627 
   3628  TaggedParserAtomIndex atom = drainCharBufferIntoAtom();
   3629  if (!atom) {
   3630    return false;
   3631  }
   3632 
         // Success: disarm the scope-exit so badToken() is not run on return.
   3633  noteBadToken.release();
   3634 
   3635  MOZ_ASSERT_IF(!parsingTemplate, !templateHead);
   3636 
   3637  TokenKind kind = !parsingTemplate ? TokenKind::String
   3638                   : templateHead   ? TokenKind::TemplateHead
   3639                                    : TokenKind::NoSubsTemplate;
   3640  newAtomToken(kind, atom, start, modifier, out);
   3641  return true;
   3642 }
   3643 
   3644 const char* TokenKindToDesc(TokenKind tt) {
         // Returns the description string that FOR_EACH_TOKEN_KIND pairs with
         // |tt|.  EMIT_CASE expands each (name, desc) entry of that list into a
         // |case TokenKind::name: return desc;|, so the switch body is generated
         // entirely from the token-kind list.
         //
         // TokenKind::Limit is not a valid argument: it asserts in debug builds
         // and otherwise falls out of the switch to the "<bad TokenKind>"
         // fallback below.
   3645  switch (tt) {
   3646 #define EMIT_CASE(name, desc) \
   3647  case TokenKind::name:       \
   3648    return desc;
   3649    FOR_EACH_TOKEN_KIND(EMIT_CASE)
   3650 #undef EMIT_CASE
   3651    case TokenKind::Limit:
   3652      MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
   3653      break;
   3654  }
   3655 
   3656  return "<bad TokenKind>";
   3657 }
   3658 
   3659 #ifdef DEBUG
        // Debug-only helper: returns the spelling of |tt| as a qualified
        // enumerator, i.e. "TokenKind::" followed by the enumerator's name (built
        // by stringizing |name| in EMIT_CASE).  The |desc| field of each
        // FOR_EACH_TOKEN_KIND entry is ignored here.
        //
        // TokenKind::Limit has no entry of its own and yields "<bad TokenKind>".
   3660 const char* TokenKindToString(TokenKind tt) {
   3661  switch (tt) {
   3662 #  define EMIT_CASE(name, desc) \
   3663    case TokenKind::name:       \
   3664      return "TokenKind::" #name;
   3665    FOR_EACH_TOKEN_KIND(EMIT_CASE)
   3666 #  undef EMIT_CASE
   3667    case TokenKind::Limit:
   3668      break;
   3669  }
   3670 
   3671  return "<bad TokenKind>";
   3672 }
   3673 #endif
   3674 
        // Explicit instantiation definitions for the token-stream class templates,
        // one per (code unit type, AnyCharsAccess policy) combination listed below.

        // TokenStreamCharsBase depends only on the code unit type.
   3675 template class TokenStreamCharsBase<Utf8Unit>;
   3676 template class TokenStreamCharsBase<char16_t>;
   3677 
        // TokenStreamAnyCharsAccess policy: instantiated here for char16_t only.
   3678 template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
   3679 template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
   3680 template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>;
   3681 
        // ParserAnyCharsAccess policy: each of the three class templates below is
        // instantiated for {Utf8Unit, char16_t} x {FullParseHandler,
        // SyntaxParseHandler}, with the parser's unit type always matching the
        // token stream's unit type.
   3682 template class GeneralTokenStreamChars<
   3683    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
   3684 template class GeneralTokenStreamChars<
   3685    Utf8Unit,
   3686    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
   3687 template class GeneralTokenStreamChars<
   3688    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
   3689 template class GeneralTokenStreamChars<
   3690    char16_t,
   3691    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
   3692 
   3693 template class TokenStreamChars<
   3694    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
   3695 template class TokenStreamChars<
   3696    Utf8Unit,
   3697    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
   3698 template class TokenStreamChars<
   3699    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
   3700 template class TokenStreamChars<
   3701    char16_t,
   3702    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
   3703 
   3704 template class TokenStreamSpecific<
   3705    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
   3706 template class TokenStreamSpecific<
   3707    Utf8Unit,
   3708    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
   3709 template class TokenStreamSpecific<
   3710    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
   3711 template class TokenStreamSpecific<
   3712    char16_t,
   3713    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
   3714 
   3715 }  // namespace frontend
   3716 
   3717 }  // namespace js
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE