TokenStream.cpp (123048B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 // JS lexical scanner. 8 9 #include "frontend/TokenStream.h" 10 11 #include "mozilla/ArrayUtils.h" 12 #include "mozilla/Attributes.h" 13 #include "mozilla/Likely.h" 14 #include "mozilla/Maybe.h" 15 #include "mozilla/MemoryChecking.h" 16 #include "mozilla/ScopeExit.h" 17 #include "mozilla/Span.h" 18 #include "mozilla/TextUtils.h" 19 #include "mozilla/Utf8.h" 20 21 #include <algorithm> 22 #include <iterator> 23 #include <limits> 24 #include <stdarg.h> 25 #include <stdint.h> 26 #include <stdio.h> 27 #include <type_traits> 28 #include <utility> 29 30 #include "jsnum.h" 31 32 #include "frontend/FrontendContext.h" 33 #include "frontend/Parser.h" 34 #include "frontend/ParserAtom.h" 35 #include "frontend/ReservedWords.h" 36 #include "js/CharacterEncoding.h" // JS::ConstUTF8CharsZ 37 #include "js/ColumnNumber.h" // JS::LimitedColumnNumberOneOrigin, JS::ColumnNumberOneOrigin, JS::TaggedColumnNumberOneOrigin 38 #include "js/ErrorReport.h" // JSErrorBase 39 #include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_* 40 #include "js/Printf.h" // JS_smprintf 41 #include "js/RegExpFlags.h" // JS::RegExpFlags 42 #include "js/UniquePtr.h" 43 #include "util/Text.h" 44 #include "util/Unicode.h" 45 #include "vm/FrameIter.h" // js::{,NonBuiltin}FrameIter 46 #include "vm/JSContext.h" 47 #include "vm/Realm.h" 48 49 using mozilla::AsciiAlphanumericToNumber; 50 using mozilla::AssertedCast; 51 using mozilla::DecodeOneUtf8CodePoint; 52 using mozilla::IsAscii; 53 using mozilla::IsAsciiAlpha; 54 using mozilla::IsAsciiDigit; 55 using mozilla::IsAsciiHexDigit; 56 using mozilla::IsTrailingUnit; 57 using mozilla::MakeScopeExit; 58 using mozilla::Maybe; 59 using mozilla::PointerRangeSize; 60 using mozilla::Span; 61 using mozilla::Utf8Unit; 62 63 using JS::ReadOnlyCompileOptions; 64 using JS::RegExpFlag; 65 using JS::RegExpFlags; 66 67 struct ReservedWordInfo { 68 const char* chars; // C string with reserved word text 69 js::frontend::TokenKind tokentype; 70 }; 71 72 static const ReservedWordInfo reservedWords[] = { 73 #define RESERVED_WORD_INFO(word, name, type) {#word, js::frontend::type}, 74 FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO) 75 #undef RESERVED_WORD_INFO 76 }; 77 78 enum class ReservedWordsIndex : size_t { 79 #define ENTRY_(_1, NAME, _3) NAME, 80 FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_) 81 #undef ENTRY_ 82 }; 83 84 // Returns a ReservedWordInfo for the specified characters, or nullptr if the 85 // string is not a reserved word. 86 template <typename CharT> 87 static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) { 88 MOZ_ASSERT(length != 0); 89 90 size_t i; 91 const ReservedWordInfo* rw; 92 const char* chars; 93 94 #define JSRW_LENGTH() length 95 #define JSRW_AT(column) s[column] 96 #define JSRW_GOT_MATCH(index) \ 97 i = (index); \ 98 goto got_match; 99 #define JSRW_TEST_GUESS(index) \ 100 i = (index); \ 101 goto test_guess; 102 #define JSRW_NO_MATCH() goto no_match; 103 #include "frontend/ReservedWordsGenerated.h" 104 #undef JSRW_NO_MATCH 105 #undef JSRW_TEST_GUESS 106 #undef JSRW_GOT_MATCH 107 #undef JSRW_AT 108 #undef JSRW_LENGTH 109 110 got_match: 111 return &reservedWords[i]; 112 113 test_guess: 114 rw = &reservedWords[i]; 115 chars = rw->chars; 116 do { 117 if (*s++ != static_cast<unsigned char>(*chars++)) { 118 goto no_match; 119 } 120 } while (--length != 0); 121 return rw; 122 123 no_match: 124 return nullptr; 125 } 126 127 template <> 128 MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>( 129 const Utf8Unit* units, size_t length) { 130 return FindReservedWord(Utf8AsUnsignedChars(units), length); 131 } 132 133 static const ReservedWordInfo* FindReservedWord( 134 const js::frontend::TaggedParserAtomIndex atom) { 135 switch (atom.rawData()) { 136 #define CASE_(_1, NAME, _3) \ 137 case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \ 138 return &reservedWords[size_t(ReservedWordsIndex::NAME)]; 139 FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_) 140 #undef CASE_ 141 } 142 143 return nullptr; 144 } 145 146 template <typename CharT> 147 static constexpr bool IsAsciiBinary(CharT c) { 148 using UnsignedCharT = std::make_unsigned_t<CharT>; 149 auto uc = static_cast<UnsignedCharT>(c); 150 return uc == '0' || uc == '1'; 151 } 152 153 template <typename CharT> 154 static constexpr bool IsAsciiOctal(CharT c) { 155 using UnsignedCharT = std::make_unsigned_t<CharT>; 156 auto uc = static_cast<UnsignedCharT>(c); 157 return '0' <= uc && uc <= '7'; 158 } 159 160 template <typename CharT> 161 static constexpr uint8_t AsciiOctalToNumber(CharT c) { 162 using UnsignedCharT = std::make_unsigned_t<CharT>; 163 auto uc = static_cast<UnsignedCharT>(c); 164 return uc - '0'; 165 } 166 167 namespace js { 168 169 namespace frontend { 170 171 bool IsKeyword(TaggedParserAtomIndex atom) { 172 if (const ReservedWordInfo* rw = FindReservedWord(atom)) { 173 return TokenKindIsKeyword(rw->tokentype); 174 } 175 176 return false; 177 } 178 179 TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) { 180 if (const ReservedWordInfo* rw = FindReservedWord(name)) { 181 return rw->tokentype; 182 } 183 184 return TokenKind::Limit; 185 } 186 187 const char* ReservedWordToCharZ(TaggedParserAtomIndex name) { 188 if (const ReservedWordInfo* rw = FindReservedWord(name)) { 189 return ReservedWordToCharZ(rw->tokentype); 190 } 191 192 return nullptr; 193 } 194 195 const char* ReservedWordToCharZ(TokenKind tt) { 196 MOZ_ASSERT(tt != TokenKind::Name); 197 switch (tt) { 198 #define EMIT_CASE(word, name, type) \ 199 case type: \ 200 return #word; 201 FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE) 202 #undef EMIT_CASE 203 default: 204 MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName."); 205 } 206 return nullptr; 207 } 208 209 TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName( 210 TokenKind tt) const { 211 MOZ_ASSERT(tt != TokenKind::Name); 212 switch (tt) { 213 #define EMIT_CASE(word, name, type) \ 214 case type: \ 215 return TaggedParserAtomIndex::WellKnown::name(); 216 FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE) 217 #undef EMIT_CASE 218 default: 219 MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind."); 220 } 221 return TaggedParserAtomIndex::null(); 222 } 223 224 SourceCoords::SourceCoords(FrontendContext* fc, uint32_t initialLineNumber, 225 uint32_t initialOffset) 226 : lineStartOffsets_(fc), initialLineNum_(initialLineNumber), lastIndex_(0) { 227 // This is actually necessary! Removing it causes compile errors on 228 // GCC and clang. You could try declaring this: 229 // 230 // const uint32_t SourceCoords::MAX_PTR; 231 // 232 // which fixes the GCC/clang error, but causes bustage on Windows. Sigh. 233 // 234 uint32_t maxPtr = MAX_PTR; 235 236 // The first line begins at buffer offset |initialOffset|. MAX_PTR is the 237 // sentinel. The appends cannot fail because |lineStartOffsets_| has 238 // statically-allocated elements. 239 MOZ_ASSERT(lineStartOffsets_.capacity() >= 2); 240 MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2)); 241 lineStartOffsets_.infallibleAppend(initialOffset); 242 lineStartOffsets_.infallibleAppend(maxPtr); 243 } 244 245 MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum, 246 uint32_t lineStartOffset) { 247 uint32_t index = indexFromLineNumber(lineNum); 248 uint32_t sentinelIndex = lineStartOffsets_.length() - 1; 249 250 MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset); 251 MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR); 252 253 if (index == sentinelIndex) { 254 // We haven't seen this newline before. Update lineStartOffsets_ 255 // only if lineStartOffsets_.append succeeds, to keep sentinel. 256 // Otherwise return false to tell TokenStream about OOM. 257 uint32_t maxPtr = MAX_PTR; 258 if (!lineStartOffsets_.append(maxPtr)) { 259 static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()), 260 TempAllocPolicy&>, 261 "this function's caller depends on it reporting an " 262 "error on failure, as TempAllocPolicy ensures"); 263 return false; 264 } 265 266 lineStartOffsets_[index] = lineStartOffset; 267 } else { 268 // We have seen this newline before (and ungot it). Do nothing (other 269 // than checking it hasn't mysteriously changed). 270 // This path can be executed after hitting OOM, so check index. 271 MOZ_ASSERT_IF(index < sentinelIndex, 272 lineStartOffsets_[index] == lineStartOffset); 273 } 274 return true; 275 } 276 277 MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) { 278 MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]); 279 MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR); 280 MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR); 281 282 if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) { 283 return true; 284 } 285 286 uint32_t sentinelIndex = lineStartOffsets_.length() - 1; 287 lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex]; 288 289 for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length(); 290 i++) { 291 if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) { 292 return false; 293 } 294 } 295 return true; 296 } 297 298 MOZ_ALWAYS_INLINE uint32_t 299 SourceCoords::indexFromOffset(uint32_t offset) const { 300 uint32_t iMin, iMax, iMid; 301 302 if (lineStartOffsets_[lastIndex_] <= offset) { 303 // If we reach here, offset is on a line the same as or higher than 304 // last time. Check first for the +0, +1, +2 cases, because they 305 // typically cover 85--98% of cases. 306 if (offset < lineStartOffsets_[lastIndex_ + 1]) { 307 return lastIndex_; // index is same as last time 308 } 309 310 // If we reach here, there must be at least one more entry (plus the 311 // sentinel). Try it. 312 lastIndex_++; 313 if (offset < lineStartOffsets_[lastIndex_ + 1]) { 314 return lastIndex_; // index is one higher than last time 315 } 316 317 // The same logic applies here. 318 lastIndex_++; 319 if (offset < lineStartOffsets_[lastIndex_ + 1]) { 320 return lastIndex_; // index is two higher than last time 321 } 322 323 // No luck. Oh well, we have a better-than-default starting point for 324 // the binary search. 325 iMin = lastIndex_ + 1; 326 MOZ_ASSERT(iMin < 327 lineStartOffsets_.length() - 1); // -1 due to the sentinel 328 329 } else { 330 iMin = 0; 331 } 332 333 // This is a binary search with deferred detection of equality, which was 334 // marginally faster in this case than a standard binary search. 335 // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we 336 // want one before that. 337 iMax = lineStartOffsets_.length() - 2; 338 while (iMax > iMin) { 339 iMid = iMin + (iMax - iMin) / 2; 340 if (offset >= lineStartOffsets_[iMid + 1]) { 341 iMin = iMid + 1; // offset is above lineStartOffsets_[iMid] 342 } else { 343 iMax = iMid; // offset is below or within lineStartOffsets_[iMid] 344 } 345 } 346 347 MOZ_ASSERT(iMax == iMin); 348 MOZ_ASSERT(lineStartOffsets_[iMin] <= offset); 349 MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]); 350 351 lastIndex_ = iMin; 352 return iMin; 353 } 354 355 SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const { 356 return LineToken(indexFromOffset(offset), offset); 357 } 358 359 TokenStreamAnyChars::TokenStreamAnyChars(FrontendContext* fc, 360 const ReadOnlyCompileOptions& options, 361 StrictModeGetter* smg) 362 : fc(fc), 363 options_(options), 364 strictModeGetter_(smg), 365 filename_(options.filename()), 366 longLineColumnInfo_(fc), 367 srcCoords(fc, options.lineno, options.scriptSourceOffset), 368 lineno(options.lineno), 369 mutedErrors(options.mutedErrors()) { 370 // |isExprEnding| was initially zeroed: overwrite the true entries here. 371 isExprEnding[size_t(TokenKind::Comma)] = true; 372 isExprEnding[size_t(TokenKind::Semi)] = true; 373 isExprEnding[size_t(TokenKind::Colon)] = true; 374 isExprEnding[size_t(TokenKind::RightParen)] = true; 375 isExprEnding[size_t(TokenKind::RightBracket)] = true; 376 isExprEnding[size_t(TokenKind::RightCurly)] = true; 377 } 378 379 template <typename Unit> 380 TokenStreamCharsBase<Unit>::TokenStreamCharsBase(FrontendContext* fc, 381 ParserAtomsTable* parserAtoms, 382 const Unit* units, 383 size_t length, 384 size_t startOffset) 385 : TokenStreamCharsShared(fc, parserAtoms), 386 sourceUnits(units, length, startOffset) {} 387 388 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer, 389 const char16_t* cur, 390 const char16_t* end) { 391 MOZ_ASSERT(charBuffer.length() == 0); 392 393 while (cur < end) { 394 char16_t ch = *cur++; 395 if (ch == '\r') { 396 ch = '\n'; 397 if (cur < end && *cur == '\n') { 398 cur++; 399 } 400 } 401 402 if (!charBuffer.append(ch)) { 403 return false; 404 } 405 } 406 407 MOZ_ASSERT(cur == end); 408 return true; 409 } 410 411 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer, 412 const Utf8Unit* cur, 413 const Utf8Unit* end) { 414 MOZ_ASSERT(charBuffer.length() == 0); 415 416 while (cur < end) { 417 Utf8Unit unit = *cur++; 418 if (MOZ_LIKELY(IsAscii(unit))) { 419 char16_t ch = unit.toUint8(); 420 if (ch == '\r') { 421 ch = '\n'; 422 if (cur < end && *cur == Utf8Unit('\n')) { 423 cur++; 424 } 425 } 426 427 if (!charBuffer.append(ch)) { 428 return false; 429 } 430 431 continue; 432 } 433 434 Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end); 435 MOZ_ASSERT(ch.isSome(), 436 "provided source text should already have been validated"); 437 438 if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) { 439 return false; 440 } 441 } 442 443 MOZ_ASSERT(cur == end); 444 return true; 445 } 446 447 template <typename Unit, class AnyCharsAccess> 448 TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific( 449 FrontendContext* fc, ParserAtomsTable* parserAtoms, 450 const ReadOnlyCompileOptions& options, const Unit* units, size_t length) 451 : TokenStreamChars<Unit, AnyCharsAccess>(fc, parserAtoms, units, length, 452 options.scriptSourceOffset) {} 453 454 bool TokenStreamAnyChars::checkOptions() { 455 // Constrain starting columns to where they will saturate. 456 if (options().column.oneOriginValue() > 457 JS::LimitedColumnNumberOneOrigin::Limit) { 458 reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER); 459 return false; 460 } 461 462 return true; 463 } 464 465 void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) const { 466 va_list args; 467 va_start(args, errorNumber); 468 469 reportErrorNoOffsetVA(errorNumber, &args); 470 471 va_end(args); 472 } 473 474 void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber, 475 va_list* args) const { 476 ErrorMetadata metadata; 477 computeErrorMetadataNoOffset(&metadata); 478 479 ReportCompileErrorLatin1VA(fc, std::move(metadata), nullptr, errorNumber, 480 args); 481 } 482 483 [[nodiscard]] MOZ_ALWAYS_INLINE bool 484 TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) { 485 prevLinebase = linebase; 486 linebase = lineStartOffset; 487 lineno++; 488 489 // On overflow, report error. 490 if (MOZ_UNLIKELY(!lineno)) { 491 reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER); 492 return false; 493 } 494 495 return srcCoords.add(lineno, linebase); 496 } 497 498 #ifdef DEBUG 499 500 template <> 501 inline void SourceUnits<char16_t>::assertNextCodePoint( 502 const PeekedCodePoint<char16_t>& peeked) { 503 char32_t c = peeked.codePoint(); 504 if (c < unicode::NonBMPMin) { 505 MOZ_ASSERT(peeked.lengthInUnits() == 1); 506 MOZ_ASSERT(ptr[0] == c); 507 } else { 508 MOZ_ASSERT(peeked.lengthInUnits() == 2); 509 char16_t lead, trail; 510 unicode::UTF16Encode(c, &lead, &trail); 511 MOZ_ASSERT(ptr[0] == lead); 512 MOZ_ASSERT(ptr[1] == trail); 513 } 514 } 515 516 template <> 517 inline void SourceUnits<Utf8Unit>::assertNextCodePoint( 518 const PeekedCodePoint<Utf8Unit>& peeked) { 519 char32_t c = peeked.codePoint(); 520 521 // This is all roughly indulgence of paranoia only for assertions, so the 522 // reimplementation of UTF-8 encoding a code point is (we think) a virtue. 523 uint8_t expectedUnits[4] = {}; 524 if (c < 0x80) { 525 expectedUnits[0] = AssertedCast<uint8_t>(c); 526 } else if (c < 0x800) { 527 expectedUnits[0] = 0b1100'0000 | (c >> 6); 528 expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111); 529 } else if (c < 0x10000) { 530 expectedUnits[0] = 0b1110'0000 | (c >> 12); 531 expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111); 532 expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111); 533 } else { 534 expectedUnits[0] = 0b1111'0000 | (c >> 18); 535 expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111); 536 expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111); 537 expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111); 538 } 539 540 MOZ_ASSERT(peeked.lengthInUnits() <= 4); 541 for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) { 542 MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8()); 543 } 544 } 545 546 #endif // DEBUG 547 548 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary( 549 const Utf8Unit** ptr, const Utf8Unit* limit) { 550 MOZ_ASSERT(*ptr <= limit); 551 552 // |limit| is a code point boundary. 553 if (MOZ_UNLIKELY(*ptr == limit)) { 554 return; 555 } 556 557 // Otherwise rewind past trailing units to the start of the code point. 558 #ifdef DEBUG 559 size_t retracted = 0; 560 #endif 561 while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) { 562 --*ptr; 563 #ifdef DEBUG 564 retracted++; 565 #endif 566 } 567 568 MOZ_ASSERT(retracted < 4, 569 "the longest UTF-8 code point is four units, so this should never " 570 "retract more than three units"); 571 } 572 573 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary( 574 const char16_t** ptr, const char16_t* limit) { 575 MOZ_ASSERT(*ptr <= limit); 576 577 // |limit| is a code point boundary. 578 if (MOZ_UNLIKELY(*ptr == limit)) { 579 return; 580 } 581 582 // Otherwise the pointer must be retracted by one iff it splits a two-unit 583 // code point. 584 if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) { 585 // Outside test suites testing garbage WTF-16, it's basically guaranteed 586 // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair. 587 if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) { 588 --*ptr; 589 } 590 } 591 } 592 593 template <typename Unit> 594 JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffset( 595 const LineToken lineToken, const uint32_t offset, 596 const SourceUnits<Unit>& sourceUnits) const { 597 lineToken.assertConsistentOffset(offset); 598 599 const uint32_t start = srcCoords.lineStart(lineToken); 600 const uint32_t offsetInLine = offset - start; 601 602 if constexpr (std::is_same_v<Unit, char16_t>) { 603 // Column offset is in UTF-16 code units. 604 return JS::ColumnNumberUnsignedOffset(offsetInLine); 605 } 606 607 return computeColumnOffsetForUTF8(lineToken, offset, start, offsetInLine, 608 sourceUnits); 609 } 610 611 template <typename Unit> 612 JS::ColumnNumberUnsignedOffset TokenStreamAnyChars::computeColumnOffsetForUTF8( 613 const LineToken lineToken, const uint32_t offset, const uint32_t start, 614 const uint32_t offsetInLine, const SourceUnits<Unit>& sourceUnits) const { 615 const uint32_t line = lineNumber(lineToken); 616 617 // Reset the previous offset/column number offset cache for this line, if the 618 // previous lookup wasn't on this line. 619 if (line != lineOfLastColumnComputation_) { 620 lineOfLastColumnComputation_ = line; 621 lastChunkVectorForLine_ = nullptr; 622 lastOffsetOfComputedColumn_ = start; 623 lastComputedColumnOffset_ = JS::ColumnNumberUnsignedOffset::zero(); 624 } 625 626 // Compute and return the final column number offset from a partially 627 // calculated offset/column number offset, using the last-cached 628 // offset/column number offset if they're more optimal. 629 auto OffsetFromPartial = 630 [this, offset, &sourceUnits]( 631 uint32_t partialOffset, 632 JS::ColumnNumberUnsignedOffset partialColumnOffset, 633 UnitsType unitsType) { 634 MOZ_ASSERT(partialOffset <= offset); 635 636 // If the last lookup on this line was closer to |offset|, use it. 637 if (partialOffset < this->lastOffsetOfComputedColumn_ && 638 this->lastOffsetOfComputedColumn_ <= offset) { 639 partialOffset = this->lastOffsetOfComputedColumn_; 640 partialColumnOffset = this->lastComputedColumnOffset_; 641 } 642 643 const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset); 644 const Unit* end = sourceUnits.codeUnitPtrAt(offset); 645 646 size_t offsetDelta = 647 AssertedCast<uint32_t>(PointerRangeSize(begin, end)); 648 partialOffset += offsetDelta; 649 650 if (unitsType == UnitsType::GuaranteedSingleUnit) { 651 MOZ_ASSERT(unicode::CountUTF16CodeUnits(begin, end) == offsetDelta, 652 "guaranteed-single-units also guarantee pointer distance " 653 "equals UTF-16 code unit count"); 654 partialColumnOffset += JS::ColumnNumberUnsignedOffset(offsetDelta); 655 } else { 656 partialColumnOffset += JS::ColumnNumberUnsignedOffset( 657 AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end))); 658 } 659 660 this->lastOffsetOfComputedColumn_ = partialOffset; 661 this->lastComputedColumnOffset_ = partialColumnOffset; 662 return partialColumnOffset; 663 }; 664 665 // We won't add an entry to |longLineColumnInfo_| for lines where the maximum 666 // column has offset less than this value. The most common (non-minified) 667 // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to 668 // the next power of two for efficient division/multiplication below. 669 constexpr uint32_t ColumnChunkLength = mozilla::RoundUpPow2(100); 670 671 // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk. 672 const uint32_t chunkIndex = offsetInLine / ColumnChunkLength; 673 if (chunkIndex == 0) { 674 // We don't know from an |offset| in the zeroth chunk that this line is even 675 // long. First-chunk info is mostly useless, anyway -- we have |start| 676 // already. So if we have *easy* access to that zeroth chunk, use it -- 677 // otherwise just count pessimally. (This will still benefit from caching 678 // the last column/offset for computations for successive offsets, so it's 679 // not *always* worst-case.) 680 UnitsType unitsType; 681 if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) { 682 MOZ_ASSERT((*lastChunkVectorForLine_)[0].columnOffset() == 683 JS::ColumnNumberUnsignedOffset::zero()); 684 unitsType = (*lastChunkVectorForLine_)[0].unitsType(); 685 } else { 686 unitsType = UnitsType::PossiblyMultiUnit; 687 } 688 689 return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(), 690 unitsType); 691 } 692 693 // If this line has no chunk vector yet, insert one in the hash map. (The 694 // required index is allocated and filled further down.) 695 if (!lastChunkVectorForLine_) { 696 auto ptr = longLineColumnInfo_.lookupForAdd(line); 697 if (!ptr) { 698 // This could rehash and invalidate a cached vector pointer, but the outer 699 // condition means we don't have a cached pointer. 700 if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(fc))) { 701 // In case of OOM, just count columns from the start of the line. 702 fc->recoverFromOutOfMemory(); 703 return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(), 704 UnitsType::PossiblyMultiUnit); 705 } 706 } 707 708 // Note that adding elements to this vector won't invalidate this pointer. 709 lastChunkVectorForLine_ = &ptr->value(); 710 } 711 712 const Unit* const limit = sourceUnits.codeUnitPtrAt(offset); 713 714 auto RetractedOffsetOfChunk = [ 715 #ifdef DEBUG 716 this, 717 #endif 718 start, limit, 719 &sourceUnits](uint32_t index) { 720 MOZ_ASSERT(index < this->lastChunkVectorForLine_->length()); 721 722 uint32_t naiveOffset = start + index * ColumnChunkLength; 723 const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset); 724 725 const Unit* actualPtr = naivePtr; 726 RetractPointerToCodePointBoundary(&actualPtr, limit); 727 728 #ifdef DEBUG 729 if ((*this->lastChunkVectorForLine_)[index].unitsType() == 730 UnitsType::GuaranteedSingleUnit) { 731 MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value"); 732 } 733 #endif 734 735 return naiveOffset - PointerRangeSize(actualPtr, naivePtr); 736 }; 737 738 uint32_t partialOffset; 739 JS::ColumnNumberUnsignedOffset partialColumnOffset; 740 UnitsType unitsType; 741 742 auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length()); 743 if (chunkIndex < entriesLen) { 744 // We've computed the chunk |offset| resides in. Compute the column number 745 // from the chunk. 746 partialOffset = RetractedOffsetOfChunk(chunkIndex); 747 partialColumnOffset = (*lastChunkVectorForLine_)[chunkIndex].columnOffset(); 748 749 // This is exact if |chunkIndex| isn't the last chunk. 750 unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType(); 751 752 // Otherwise the last chunk is pessimistically assumed to contain multi-unit 753 // code points because we haven't fully examined its contents yet -- they 754 // may not have been tokenized yet, they could contain encoding errors, or 755 // they might not even exist. 756 MOZ_ASSERT_IF(chunkIndex == entriesLen - 1, 757 (*lastChunkVectorForLine_)[chunkIndex].unitsType() == 758 UnitsType::PossiblyMultiUnit); 759 } else { 760 // Extend the vector from its last entry or the start of the line. (This is 761 // also a suitable partial start point if we must recover from OOM.) 762 if (entriesLen > 0) { 763 partialOffset = RetractedOffsetOfChunk(entriesLen - 1); 764 partialColumnOffset = 765 (*lastChunkVectorForLine_)[entriesLen - 1].columnOffset(); 766 } else { 767 partialOffset = start; 768 partialColumnOffset = JS::ColumnNumberUnsignedOffset::zero(); 769 } 770 771 if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) { 772 // As earlier, just start from the greatest offset/column in case of OOM. 773 fc->recoverFromOutOfMemory(); 774 return OffsetFromPartial(partialOffset, partialColumnOffset, 775 UnitsType::PossiblyMultiUnit); 776 } 777 778 // OOM is no longer possible now. \o/ 779 780 // The vector always begins with the column of the line start, i.e. zero, 781 // with chunk units pessimally assumed not single-unit. 782 if (entriesLen == 0) { 783 lastChunkVectorForLine_->infallibleAppend( 784 ChunkInfo(JS::ColumnNumberUnsignedOffset::zero(), 785 UnitsType::PossiblyMultiUnit)); 786 entriesLen++; 787 } 788 789 do { 790 const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset); 791 const Unit* chunkLimit = sourceUnits.codeUnitPtrAt( 792 start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine)); 793 794 MOZ_ASSERT(begin < chunkLimit); 795 MOZ_ASSERT(chunkLimit <= limit); 796 797 static_assert( 798 ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1, 799 "any retraction below is assumed to never underflow to the " 800 "preceding chunk, even for the longest code point"); 801 802 // Prior tokenizing ensured that [begin, limit) is validly encoded, and 803 // |begin < chunkLimit|, so any retraction here can't underflow. 804 RetractPointerToCodePointBoundary(&chunkLimit, limit); 805 806 MOZ_ASSERT(begin < chunkLimit); 807 MOZ_ASSERT(chunkLimit <= limit); 808 809 size_t numUnits = PointerRangeSize(begin, chunkLimit); 810 size_t numUTF16CodeUnits = 811 unicode::CountUTF16CodeUnits(begin, chunkLimit); 812 813 // If this chunk (which will become non-final at the end of the loop) is 814 // all single-unit code points, annotate the chunk accordingly. 815 if (numUnits == numUTF16CodeUnits) { 816 lastChunkVectorForLine_->back().guaranteeSingleUnits(); 817 } 818 819 partialOffset += numUnits; 820 partialColumnOffset += JS::ColumnNumberUnsignedOffset(numUTF16CodeUnits); 821 822 lastChunkVectorForLine_->infallibleEmplaceBack( 823 partialColumnOffset, UnitsType::PossiblyMultiUnit); 824 } while (entriesLen < chunkIndex + 1); 825 826 // We're at a spot in the current final chunk, and final chunks never have 827 // complete units information, so be pessimistic. 828 unitsType = UnitsType::PossiblyMultiUnit; 829 } 830 831 return OffsetFromPartial(partialOffset, partialColumnOffset, unitsType); 832 } 833 834 template <typename Unit, class AnyCharsAccess> 835 JS::LimitedColumnNumberOneOrigin 836 GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn( 837 LineToken lineToken, uint32_t offset) const { 838 lineToken.assertConsistentOffset(offset); 839 840 const TokenStreamAnyChars& anyChars = anyCharsAccess(); 841 842 JS::ColumnNumberUnsignedOffset columnOffset = 843 anyChars.computeColumnOffset(lineToken, offset, this->sourceUnits); 844 845 if (!lineToken.isFirstLine()) { 846 return JS::LimitedColumnNumberOneOrigin::fromUnlimited( 847 JS::ColumnNumberOneOrigin() + columnOffset); 848 } 849 850 if (1 + columnOffset.value() > JS::LimitedColumnNumberOneOrigin::Limit) { 851 return JS::LimitedColumnNumberOneOrigin::limit(); 852 } 853 854 return JS::LimitedColumnNumberOneOrigin::fromUnlimited( 855 (anyChars.options_.column + columnOffset).oneOriginValue()); 856 } 857 858 template <typename Unit, class AnyCharsAccess> 859 void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn( 860 uint32_t offset, uint32_t* line, 861 JS::LimitedColumnNumberOneOrigin* column) const { 862 const TokenStreamAnyChars& anyChars = anyCharsAccess(); 863 864 auto lineToken = anyChars.lineToken(offset); 865 *line = anyChars.lineNumber(lineToken); 866 *column = computeColumn(lineToken, offset); 867 } 868 869 template <class AnyCharsAccess> 870 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError( 871 uint8_t relevantUnits, unsigned errorNumber, ...) { 872 va_list args; 873 va_start(args, errorNumber); 874 875 do { 876 size_t offset = this->sourceUnits.offset(); 877 878 ErrorMetadata err; 879 880 TokenStreamAnyChars& anyChars = anyCharsAccess(); 881 882 bool canAddLineOfContext = fillExceptingContext(&err, offset); 883 if (canAddLineOfContext) { 884 if (!internalComputeLineOfContext(&err, offset)) { 885 break; 886 } 887 888 // As this is an encoding error, the computed window-end must be 889 // identical to the location of the error -- any further on and the 890 // window would contain invalid Unicode. 891 MOZ_ASSERT_IF(err.lineOfContext != nullptr, 892 err.lineLength == err.tokenOffset); 893 } 894 895 auto notes = MakeUnique<JSErrorNotes>(); 896 if (!notes) { 897 ReportOutOfMemory(anyChars.fc); 898 break; 899 } 900 901 // The largest encoding of a UTF-8 code point is 4 units. (Encoding an 902 // obsolete 5- or 6-byte code point will complain only about a bad lead 903 // code unit.) 904 constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH"); 905 906 MOZ_ASSERT(relevantUnits > 0); 907 908 char badUnitsStr[MaxWidth]; 909 char* ptr = badUnitsStr; 910 while (relevantUnits > 0) { 911 byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr); 912 ptr[4] = ' '; 913 914 ptr += 5; 915 relevantUnits--; 916 } 917 918 ptr[-1] = '\0'; 919 920 uint32_t line; 921 JS::LimitedColumnNumberOneOrigin column; 922 computeLineAndColumn(offset, &line, &column); 923 924 if (!notes->addNoteASCII(anyChars.fc, anyChars.getFilename().c_str(), 0, 925 line, JS::ColumnNumberOneOrigin(column), 926 GetErrorMessage, nullptr, JSMSG_BAD_CODE_UNITS, 927 badUnitsStr)) { 928 break; 929 } 930 931 ReportCompileErrorLatin1VA(anyChars.fc, std::move(err), std::move(notes), 932 errorNumber, &args); 933 } while (false); 934 935 va_end(args); 936 } 937 938 template <class AnyCharsAccess> 939 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit( 940 Utf8Unit lead) { 941 uint8_t leadValue = lead.toUint8(); 942 943 char leadByteStr[5]; 944 byteToTerminatedString(leadValue, leadByteStr); 945 946 internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr); 947 } 948 949 template <class AnyCharsAccess> 950 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits( 951 Utf8Unit lead, uint8_t remaining, uint8_t required) { 952 uint8_t leadValue = lead.toUint8(); 953 954 MOZ_ASSERT(required == 2 || required == 3 || required == 4); 955 MOZ_ASSERT(remaining < 4); 956 MOZ_ASSERT(remaining < required); 957 958 char leadByteStr[5]; 959 byteToTerminatedString(leadValue, leadByteStr); 960 961 // |toHexChar| produces the desired decimal numbers for values < 4. 962 const char expectedStr[] = {toHexChar(required - 1), '\0'}; 963 const char actualStr[] = {toHexChar(remaining - 1), '\0'}; 964 965 internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr, 966 expectedStr, required == 2 ? "" : "s", actualStr, 967 remaining == 2 ? " was" : "s were"); 968 } 969 970 template <class AnyCharsAccess> 971 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit( 972 uint8_t unitsObserved) { 973 Utf8Unit badUnit = 974 this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1]; 975 976 char badByteStr[5]; 977 byteToTerminatedString(badUnit.toUint8(), badByteStr); 978 979 internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT, 980 badByteStr); 981 } 982 983 template <class AnyCharsAccess> 984 MOZ_COLD void 985 TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint( 986 char32_t codePoint, uint8_t codePointLength, const char* reason) { 987 // Construct a string like "0x203D" (including null terminator) to include 988 // in the error message. Write the string end-to-start from end to start 989 // of an adequately sized |char| array, shifting least significant nibbles 990 // off the number and writing the corresponding hex digits until done, then 991 // prefixing with "0x". |codePointStr| points at the incrementally 992 // computed string, within |codePointCharsArray|'s bounds. 993 994 // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained 995 // bits in a four-byte UTF-8 code unit sequence. 996 constexpr size_t MaxHexSize = sizeof( 997 "0x1F" 998 "FFFF"); // including '\0' 999 char codePointCharsArray[MaxHexSize]; 1000 1001 char* codePointStr = std::end(codePointCharsArray); 1002 *--codePointStr = '\0'; 1003 1004 // Note that by do-while looping here rather than while-looping, this 1005 // writes a '0' when |codePoint == 0|. 1006 do { 1007 MOZ_ASSERT(codePointCharsArray < codePointStr); 1008 *--codePointStr = toHexChar(codePoint & 0xF); 1009 codePoint >>= 4; 1010 } while (codePoint); 1011 1012 MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr); 1013 *--codePointStr = 'x'; 1014 *--codePointStr = '0'; 1015 1016 internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT, 1017 codePointStr, reason); 1018 } 1019 1020 template <class AnyCharsAccess> 1021 [[nodiscard]] bool 1022 TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize( 1023 Utf8Unit lead, char32_t* codePoint) { 1024 auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); }; 1025 1026 auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) { 1027 this->notEnoughUnits(lead, remaining, required); 1028 }; 1029 1030 auto onBadTrailingUnit = [this](uint8_t unitsObserved) { 1031 this->badTrailingUnit(unitsObserved); 1032 }; 1033 1034 auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) { 1035 this->badCodePoint(badCodePoint, unitsObserved); 1036 }; 1037 1038 auto onNotShortestForm = [this](char32_t badCodePoint, 1039 uint8_t unitsObserved) { 1040 this->notShortestForm(badCodePoint, unitsObserved); 1041 }; 1042 1043 // If a valid code point is decoded, this function call consumes its code 1044 // units. If not, it ungets the lead code unit and invokes the right error 1045 // handler, so on failure we must immediately return false. 1046 SourceUnitsIterator iter(this->sourceUnits); 1047 Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline( 1048 lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits, 1049 onBadTrailingUnit, onBadCodePoint, onNotShortestForm); 1050 if (maybeCodePoint.isNothing()) { 1051 return false; 1052 } 1053 1054 *codePoint = maybeCodePoint.value(); 1055 return true; 1056 } 1057 1058 template <class AnyCharsAccess> 1059 bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint( 1060 int32_t lead, char32_t* codePoint) { 1061 MOZ_ASSERT(lead != EOF); 1062 MOZ_ASSERT(!isAsciiCodePoint(lead), 1063 "ASCII code unit/point must be handled separately"); 1064 MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(), 1065 "getNonAsciiCodePoint called incorrectly"); 1066 1067 // The code point is usually |lead|: overwrite later if needed. 1068 *codePoint = AssertedCast<char32_t>(lead); 1069 1070 // ECMAScript specifically requires that unpaired UTF-16 surrogates be 1071 // treated as the corresponding code point and not as an error. See 1072 // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>. 1073 // Thus this function does not consider any sequence of 16-bit numbers to 1074 // be intrinsically in error. 1075 1076 // Dispense with single-unit code points and lone trailing surrogates. 1077 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) { 1078 if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR || 1079 lead == unicode::PARA_SEPARATOR)) { 1080 if (!updateLineInfoForEOL()) { 1081 #ifdef DEBUG 1082 // Assign to a sentinel value to hopefully cause errors. 1083 *codePoint = std::numeric_limits<char32_t>::max(); 1084 #endif 1085 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint)); 1086 return false; 1087 } 1088 1089 *codePoint = '\n'; 1090 } else { 1091 MOZ_ASSERT(!IsLineTerminator(*codePoint)); 1092 } 1093 1094 return true; 1095 } 1096 1097 // Also handle a lead surrogate not paired with a trailing surrogate. 1098 if (MOZ_UNLIKELY( 1099 this->sourceUnits.atEnd() || 1100 !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) { 1101 MOZ_ASSERT(!IsLineTerminator(*codePoint)); 1102 return true; 1103 } 1104 1105 // Otherwise we have a multi-unit code point. 1106 *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit()); 1107 MOZ_ASSERT(!IsLineTerminator(*codePoint)); 1108 return true; 1109 } 1110 1111 template <class AnyCharsAccess> 1112 bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint( 1113 int32_t unit, char32_t* codePoint) { 1114 MOZ_ASSERT(unit != EOF); 1115 MOZ_ASSERT(!isAsciiCodePoint(unit), 1116 "ASCII code unit/point must be handled separately"); 1117 1118 Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit)); 1119 MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(), 1120 "getNonAsciiCodePoint called incorrectly"); 1121 1122 auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); }; 1123 1124 auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining, 1125 uint_fast8_t required) { 1126 this->notEnoughUnits(lead, remaining, required); 1127 }; 1128 1129 auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) { 1130 this->badTrailingUnit(unitsObserved); 1131 }; 1132 1133 auto onBadCodePoint = [this](char32_t badCodePoint, 1134 uint_fast8_t unitsObserved) { 1135 this->badCodePoint(badCodePoint, unitsObserved); 1136 }; 1137 1138 auto onNotShortestForm = [this](char32_t badCodePoint, 1139 uint_fast8_t unitsObserved) { 1140 this->notShortestForm(badCodePoint, unitsObserved); 1141 }; 1142 1143 // This consumes the full, valid code point or ungets |lead| and calls the 1144 // appropriate error functor on failure. 1145 SourceUnitsIterator iter(this->sourceUnits); 1146 Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint( 1147 lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits, 1148 onBadTrailingUnit, onBadCodePoint, onNotShortestForm); 1149 if (maybeCodePoint.isNothing()) { 1150 return false; 1151 } 1152 1153 char32_t cp = maybeCodePoint.value(); 1154 if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR || 1155 cp == unicode::PARA_SEPARATOR)) { 1156 if (!updateLineInfoForEOL()) { 1157 #ifdef DEBUG 1158 // Assign to a sentinel value to hopefully cause errors. 1159 *codePoint = std::numeric_limits<char32_t>::max(); 1160 #endif 1161 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint)); 1162 return false; 1163 } 1164 1165 *codePoint = '\n'; 1166 } else { 1167 MOZ_ASSERT(!IsLineTerminator(cp)); 1168 *codePoint = cp; 1169 } 1170 1171 return true; 1172 } 1173 1174 template <> 1175 size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const { 1176 // This is JS's understanding of UTF-16 that allows lone surrogates, so 1177 // we have to exclude lone surrogates from [windowStart, offset) ourselves. 1178 1179 const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_); 1180 1181 const char16_t* const initial = codeUnitPtrAt(offset); 1182 const char16_t* p = initial; 1183 1184 auto HalfWindowSize = [&p, &initial]() { 1185 return PointerRangeSize(p, initial); 1186 }; 1187 1188 while (true) { 1189 MOZ_ASSERT(earliestPossibleStart <= p); 1190 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1191 if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) { 1192 break; 1193 } 1194 1195 char16_t c = p[-1]; 1196 1197 // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in 1198 // string and template literals. These code points do affect line and 1199 // column coordinates, even as they encode their literal values. 1200 if (IsLineTerminator(c)) { 1201 break; 1202 } 1203 1204 // Don't allow invalid UTF-16 in pre-context. (Current users don't 1205 // require this, and this behavior isn't currently imposed on 1206 // pre-context, but these facts might change someday.) 1207 1208 if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) { 1209 break; 1210 } 1211 1212 // Optimistically include the code unit, reverting below if needed. 1213 p--; 1214 1215 // If it's not a surrogate at all, keep going. 1216 if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) { 1217 continue; 1218 } 1219 1220 // Stop if we don't have a usable surrogate pair. 1221 if (HalfWindowSize() >= WindowRadius || 1222 p <= earliestPossibleStart || // trail surrogate at low end 1223 !unicode::IsLeadSurrogate(p[-1])) // no paired lead surrogate 1224 { 1225 p++; 1226 break; 1227 } 1228 1229 p--; 1230 } 1231 1232 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1233 return offset - HalfWindowSize(); 1234 } 1235 1236 template <> 1237 size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const { 1238 // |offset| must be the location of the error or somewhere before it, so we 1239 // know preceding data is valid UTF-8. 1240 1241 const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_); 1242 1243 const Utf8Unit* const initial = codeUnitPtrAt(offset); 1244 const Utf8Unit* p = initial; 1245 1246 auto HalfWindowSize = [&p, &initial]() { 1247 return PointerRangeSize(p, initial); 1248 }; 1249 1250 while (true) { 1251 MOZ_ASSERT(earliestPossibleStart <= p); 1252 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1253 if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) { 1254 break; 1255 } 1256 1257 // Peek backward for a line break, and only decrement if there is none. 1258 uint8_t prev = p[-1].toUint8(); 1259 1260 // First check for the ASCII LineTerminators. 1261 if (prev == '\r' || prev == '\n') { 1262 break; 1263 } 1264 1265 // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR 1266 // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9). If there 1267 // aren't three code units available, some comparison here will fail 1268 // before we'd underflow. 1269 if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) && 1270 p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) { 1271 break; 1272 } 1273 1274 // Rewind over the non-LineTerminator. This can't underflow 1275 // |earliestPossibleStart| because it begins a code point. 1276 while (IsTrailingUnit(*--p)) { 1277 continue; 1278 } 1279 1280 MOZ_ASSERT(earliestPossibleStart <= p); 1281 1282 // But if we underflowed |WindowRadius|, adjust forward and stop. 1283 if (HalfWindowSize() > WindowRadius) { 1284 static_assert(WindowRadius > 3, 1285 "skipping over non-lead code units below must not " 1286 "advance past |offset|"); 1287 1288 while (IsTrailingUnit(*++p)) { 1289 continue; 1290 } 1291 1292 MOZ_ASSERT(HalfWindowSize() < WindowRadius); 1293 break; 1294 } 1295 } 1296 1297 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1298 return offset - HalfWindowSize(); 1299 } 1300 1301 template <> 1302 size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const { 1303 const char16_t* const initial = codeUnitPtrAt(offset); 1304 const char16_t* p = initial; 1305 1306 auto HalfWindowSize = [&initial, &p]() { 1307 return PointerRangeSize(initial, p); 1308 }; 1309 1310 while (true) { 1311 MOZ_ASSERT(p <= limit_); 1312 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1313 if (p >= limit_ || HalfWindowSize() >= WindowRadius) { 1314 break; 1315 } 1316 1317 char16_t c = *p; 1318 1319 // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in 1320 // string and template literals. These code points do affect line and 1321 // column coordinates, even as they encode their literal values. 1322 if (IsLineTerminator(c)) { 1323 break; 1324 } 1325 1326 // Don't allow invalid UTF-16 in post-context. (Current users don't 1327 // require this, and this behavior isn't currently imposed on 1328 // pre-context, but these facts might change someday.) 1329 1330 if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) { 1331 break; 1332 } 1333 1334 // Optimistically consume the code unit, ungetting it below if needed. 1335 p++; 1336 1337 // If it's not a surrogate at all, keep going. 1338 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) { 1339 continue; 1340 } 1341 1342 // Retract if the lead surrogate would stand alone at the end of the 1343 // window. 1344 if (HalfWindowSize() >= WindowRadius || // split pair 1345 p >= limit_ || // half-pair at end of source 1346 !unicode::IsTrailSurrogate(*p)) // no paired trail surrogate 1347 { 1348 p--; 1349 break; 1350 } 1351 1352 p++; 1353 } 1354 1355 return offset + HalfWindowSize(); 1356 } 1357 1358 template <> 1359 size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const { 1360 const Utf8Unit* const initial = codeUnitPtrAt(offset); 1361 const Utf8Unit* p = initial; 1362 1363 auto HalfWindowSize = [&initial, &p]() { 1364 return PointerRangeSize(initial, p); 1365 }; 1366 1367 while (true) { 1368 MOZ_ASSERT(p <= limit_); 1369 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1370 if (p >= limit_ || HalfWindowSize() >= WindowRadius) { 1371 break; 1372 } 1373 1374 // A non-encoding error might be followed by an encoding error within 1375 // |maxEnd|, so we must validate as we go to not include invalid UTF-8 1376 // in the computed window. What joy! 1377 1378 Utf8Unit lead = *p; 1379 if (mozilla::IsAscii(lead)) { 1380 if (IsSingleUnitLineTerminator(lead)) { 1381 break; 1382 } 1383 1384 p++; 1385 continue; 1386 } 1387 1388 PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_); 1389 if (peeked.isNone()) { 1390 break; // encoding error 1391 } 1392 1393 char32_t c = peeked.codePoint(); 1394 if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR || 1395 c == unicode::PARA_SEPARATOR)) { 1396 break; 1397 } 1398 1399 MOZ_ASSERT(!IsLineTerminator(c)); 1400 1401 uint8_t len = peeked.lengthInUnits(); 1402 if (HalfWindowSize() + len > WindowRadius) { 1403 break; 1404 } 1405 1406 p += len; 1407 } 1408 1409 MOZ_ASSERT(HalfWindowSize() <= WindowRadius); 1410 return offset + HalfWindowSize(); 1411 } 1412 1413 template <typename Unit, class AnyCharsAccess> 1414 bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) { 1415 const Unit* end = this->sourceUnits.codeUnitPtrAt(position); 1416 while (this->sourceUnits.addressOfNextCodeUnit() < end) { 1417 if (!getCodePoint()) { 1418 return false; 1419 } 1420 } 1421 1422 TokenStreamAnyChars& anyChars = anyCharsAccess(); 1423 Token* cur = const_cast<Token*>(&anyChars.currentToken()); 1424 cur->pos.begin = this->sourceUnits.offset(); 1425 cur->pos.end = cur->pos.begin; 1426 #ifdef DEBUG 1427 cur->type = TokenKind::Limit; 1428 #endif 1429 MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type)); 1430 anyChars.lookahead = 0; 1431 return true; 1432 } 1433 1434 template <typename Unit, class AnyCharsAccess> 1435 void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) { 1436 TokenStreamAnyChars& anyChars = anyCharsAccess(); 1437 1438 this->sourceUnits.setAddressOfNextCodeUnit(pos.buf, 1439 /* allowPoisoned = */ true); 1440 anyChars.flags = pos.flags; 1441 anyChars.lineno = pos.lineno; 1442 anyChars.linebase = pos.linebase; 1443 anyChars.prevLinebase = pos.prevLinebase; 1444 anyChars.lookahead = pos.lookahead; 1445 1446 anyChars.tokens[anyChars.cursor()] = pos.currentToken; 1447 for (unsigned i = 0; i < anyChars.lookahead; i++) { 1448 anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i]; 1449 } 1450 } 1451 1452 template <typename Unit, class AnyCharsAccess> 1453 bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo( 1454 const Position& pos, const TokenStreamAnyChars& other) { 1455 if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) { 1456 return false; 1457 } 1458 1459 seekTo(pos); 1460 return true; 1461 } 1462 1463 void TokenStreamAnyChars::computeErrorMetadataNoOffset( 1464 ErrorMetadata* err) const { 1465 err->isMuted = mutedErrors; 1466 err->filename = filename_; 1467 err->lineNumber = 0; 1468 err->columnNumber = JS::ColumnNumberOneOrigin(); 1469 1470 MOZ_ASSERT(err->lineOfContext == nullptr); 1471 } 1472 1473 bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err, 1474 uint32_t offset) const { 1475 err->isMuted = mutedErrors; 1476 1477 // If this TokenStreamAnyChars doesn't have location information, try to 1478 // get it from the caller. 1479 if (!filename_) { 1480 JSContext* maybeCx = context()->maybeCurrentJSContext(); 1481 if (maybeCx) { 1482 NonBuiltinFrameIter iter(maybeCx, 1483 FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK, 1484 maybeCx->realm()->principals()); 1485 if (!iter.done() && iter.filename()) { 1486 err->filename = JS::ConstUTF8CharsZ(iter.filename()); 1487 JS::TaggedColumnNumberOneOrigin columnNumber; 1488 err->lineNumber = iter.computeLine(&columnNumber); 1489 err->columnNumber = 1490 JS::ColumnNumberOneOrigin(columnNumber.oneOriginValue()); 1491 return false; 1492 } 1493 } 1494 } 1495 1496 // Otherwise use this TokenStreamAnyChars's location information. 1497 err->filename = filename_; 1498 return true; 1499 } 1500 1501 template <> 1502 inline void SourceUnits<char16_t>::computeWindowOffsetAndLength( 1503 const char16_t* encodedWindow, size_t encodedTokenOffset, 1504 size_t* utf16TokenOffset, size_t encodedWindowLength, 1505 size_t* utf16WindowLength) const { 1506 MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16"); 1507 } 1508 1509 template <> 1510 inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength( 1511 const Utf8Unit* encodedWindow, size_t encodedTokenOffset, 1512 size_t* utf16TokenOffset, size_t encodedWindowLength, 1513 size_t* utf16WindowLength) const { 1514 MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength, 1515 "token offset must be within the window, and the two lambda " 1516 "calls below presume this ordering of values"); 1517 1518 const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength; 1519 1520 size_t i = 0; 1521 auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) { 1522 while (encodedWindow < limit) { 1523 Utf8Unit lead = *encodedWindow++; 1524 if (MOZ_LIKELY(IsAscii(lead))) { 1525 // ASCII contributes a single UTF-16 code unit. 1526 i++; 1527 continue; 1528 } 1529 1530 Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit); 1531 MOZ_ASSERT(cp.isSome(), 1532 "computed window should only contain valid UTF-8"); 1533 1534 i += unicode::IsSupplementary(cp.value()) ? 2 : 1; 1535 } 1536 1537 return i; 1538 }; 1539 1540 // Compute the token offset from |i == 0| and the initial |encodedWindow|. 1541 const Utf8Unit* token = encodedWindow + encodedTokenOffset; 1542 MOZ_ASSERT(token <= encodedWindowEnd); 1543 *utf16TokenOffset = ComputeUtf16Count(token); 1544 1545 // Compute the window length, picking up from |i| and |encodedWindow| that, 1546 // in general, were modified just above. 1547 *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd); 1548 } 1549 1550 template <typename Unit> 1551 bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err, 1552 uint32_t offset) const { 1553 // Rename the variable to make meaning clearer: an offset into source units 1554 // in Unit encoding. 1555 size_t encodedOffset = offset; 1556 1557 // These are also offsets into source units in Unit encoding. 1558 size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset); 1559 size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset); 1560 1561 size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart; 1562 MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2); 1563 1564 // Don't add a useless "line" of context when the window ends up empty 1565 // because of an invalid encoding at the start of a line. 1566 if (encodedWindowLength == 0) { 1567 MOZ_ASSERT(err->lineOfContext == nullptr, 1568 "ErrorMetadata::lineOfContext must be null so we don't " 1569 "have to set the lineLength/tokenOffset fields"); 1570 return true; 1571 } 1572 1573 CharBuffer lineOfContext(fc); 1574 1575 const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart); 1576 if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks( 1577 lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) { 1578 return false; 1579 } 1580 1581 size_t utf16WindowLength = lineOfContext.length(); 1582 1583 // The windowed string is null-terminated. 1584 if (!lineOfContext.append('\0')) { 1585 return false; 1586 } 1587 1588 err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer()); 1589 if (!err->lineOfContext) { 1590 return false; 1591 } 1592 1593 size_t encodedTokenOffset = encodedOffset - encodedWindowStart; 1594 1595 MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength, 1596 "token offset must be inside the window"); 1597 1598 // The length in UTF-8 code units of a code point is always greater than or 1599 // equal to the same code point's length in UTF-16 code points. ASCII code 1600 // points are 1 unit in either encoding. Code points in [U+0080, U+10000) 1601 // are 2-3 UTF-8 code units to 1 UTF-16 code unit. And code points in 1602 // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units. 1603 // 1604 // Therefore, if encoded window length equals the length in UTF-16 (this is 1605 // always the case for Unit=char16_t), the UTF-16 offsets are exactly the 1606 // encoded offsets. Otherwise we must convert offset/length from UTF-8 to 1607 // UTF-16. 1608 if constexpr (std::is_same_v<Unit, char16_t>) { 1609 MOZ_ASSERT(utf16WindowLength == encodedWindowLength, 1610 "UTF-16 to UTF-16 shouldn't change window length"); 1611 err->tokenOffset = encodedTokenOffset; 1612 err->lineLength = encodedWindowLength; 1613 } else { 1614 static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here"); 1615 1616 bool simple = utf16WindowLength == encodedWindowLength; 1617 MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength, 1618 [](Unit u) { return IsAscii(u); }) == simple, 1619 "equal window lengths in UTF-8 should correspond only to " 1620 "wholly-ASCII text"); 1621 if (simple) { 1622 err->tokenOffset = encodedTokenOffset; 1623 err->lineLength = encodedWindowLength; 1624 } else { 1625 sourceUnits.computeWindowOffsetAndLength( 1626 encodedWindow, encodedTokenOffset, &err->tokenOffset, 1627 encodedWindowLength, &err->lineLength); 1628 } 1629 } 1630 1631 return true; 1632 } 1633 1634 template <typename Unit, class AnyCharsAccess> 1635 bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata( 1636 ErrorMetadata* err, const ErrorOffset& errorOffset) const { 1637 if (errorOffset.is<NoOffset>()) { 1638 anyCharsAccess().computeErrorMetadataNoOffset(err); 1639 return true; 1640 } 1641 1642 uint32_t offset; 1643 if (errorOffset.is<uint32_t>()) { 1644 offset = errorOffset.as<uint32_t>(); 1645 } else { 1646 offset = this->sourceUnits.offset(); 1647 } 1648 1649 // This function's return value isn't a success/failure indication: it 1650 // returns true if this TokenStream can be used to provide a line of 1651 // context. 1652 if (fillExceptingContext(err, offset)) { 1653 // Add a line of context from this TokenStream to help with debugging. 1654 return internalComputeLineOfContext(err, offset); 1655 } 1656 1657 // We can't fill in any more here. 1658 return true; 1659 } 1660 1661 template <typename Unit, class AnyCharsAccess> 1662 void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter( 1663 int32_t cp) { 1664 UniqueChars display = JS_smprintf("U+%04X", cp); 1665 if (!display) { 1666 ReportOutOfMemory(anyCharsAccess().fc); 1667 return; 1668 } 1669 error(JSMSG_ILLEGAL_CHARACTER, display.get()); 1670 } 1671 1672 // We have encountered a '\': check for a Unicode escape sequence after it. 1673 // Return the length of the escape sequence and the encoded code point (by 1674 // value) if we found a Unicode escape sequence, and skip all code units 1675 // involed. Otherwise, return 0 and don't advance along the buffer. 1676 template <typename Unit, class AnyCharsAccess> 1677 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape( 1678 char32_t* codePoint) { 1679 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); 1680 1681 int32_t unit = getCodeUnit(); 1682 if (unit != 'u') { 1683 // NOTE: |unit| may be EOF here. 1684 ungetCodeUnit(unit); 1685 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); 1686 return 0; 1687 } 1688 1689 char16_t v; 1690 unit = getCodeUnit(); 1691 if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) { 1692 *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v; 1693 return 5; 1694 } 1695 1696 if (unit == '{') { 1697 return matchExtendedUnicodeEscape(codePoint); 1698 } 1699 1700 // NOTE: |unit| may be EOF here, so this ungets either one or two units. 1701 ungetCodeUnit(unit); 1702 ungetCodeUnit('u'); 1703 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); 1704 return 0; 1705 } 1706 1707 template <typename Unit, class AnyCharsAccess> 1708 uint32_t 1709 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape( 1710 char32_t* codePoint) { 1711 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{')); 1712 1713 int32_t unit = getCodeUnit(); 1714 1715 // Skip leading zeroes. 1716 uint32_t leadingZeroes = 0; 1717 while (unit == '0') { 1718 leadingZeroes++; 1719 unit = getCodeUnit(); 1720 } 1721 1722 size_t i = 0; 1723 uint32_t code = 0; 1724 while (IsAsciiHexDigit(unit) && i < 6) { 1725 code = (code << 4) | AsciiAlphanumericToNumber(unit); 1726 unit = getCodeUnit(); 1727 i++; 1728 } 1729 1730 uint32_t gotten = 1731 2 + // 'u{' 1732 leadingZeroes + i + // significant hexdigits 1733 (unit != EOF); // subtract a get if it didn't contribute to length 1734 1735 if (unit == '}' && (leadingZeroes > 0 || i > 0) && 1736 code <= unicode::NonBMPMax) { 1737 *codePoint = code; 1738 return gotten; 1739 } 1740 1741 this->sourceUnits.unskipCodeUnits(gotten); 1742 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); 1743 return 0; 1744 } 1745 1746 template <typename Unit, class AnyCharsAccess> 1747 uint32_t 1748 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart( 1749 char32_t* codePoint) { 1750 uint32_t length = matchUnicodeEscape(codePoint); 1751 if (MOZ_LIKELY(length > 0)) { 1752 if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) { 1753 return length; 1754 } 1755 1756 this->sourceUnits.unskipCodeUnits(length); 1757 } 1758 1759 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); 1760 return 0; 1761 } 1762 1763 template <typename Unit, class AnyCharsAccess> 1764 bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent( 1765 char32_t* codePoint) { 1766 uint32_t length = matchUnicodeEscape(codePoint); 1767 if (MOZ_LIKELY(length > 0)) { 1768 if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) { 1769 return true; 1770 } 1771 1772 this->sourceUnits.unskipCodeUnits(length); 1773 } 1774 1775 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\')); 1776 return false; 1777 } 1778 1779 template <typename Unit, class AnyCharsAccess> 1780 [[nodiscard]] bool 1781 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart( 1782 IdentifierEscapes* sawEscape) { 1783 int32_t unit = getCodeUnit(); 1784 if (unit == EOF) { 1785 error(JSMSG_MISSING_PRIVATE_NAME); 1786 return false; 1787 } 1788 1789 if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 1790 if (unicode::IsIdentifierStart(char16_t(unit))) { 1791 *sawEscape = IdentifierEscapes::None; 1792 return true; 1793 } 1794 1795 if (unit == '\\') { 1796 char32_t codePoint; 1797 uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint); 1798 if (escapeLength != 0) { 1799 *sawEscape = IdentifierEscapes::SawUnicodeEscape; 1800 return true; 1801 } 1802 1803 // We could point "into" a mistyped escape, e.g. for "\u{41H}" we 1804 // could point at the 'H'. But we don't do that now, so the code 1805 // unit after the '\' isn't necessarily bad, so just point at the 1806 // start of the actually-invalid escape. 1807 ungetCodeUnit('\\'); 1808 error(JSMSG_BAD_ESCAPE); 1809 return false; 1810 } 1811 } 1812 1813 // Unget the lead code unit before peeking at the full code point. 1814 ungetCodeUnit(unit); 1815 1816 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); 1817 if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) { 1818 this->sourceUnits.consumeKnownCodePoint(peeked); 1819 1820 *sawEscape = IdentifierEscapes::None; 1821 return true; 1822 } 1823 1824 error(JSMSG_MISSING_PRIVATE_NAME); 1825 return false; 1826 } 1827 1828 template <typename Unit, class AnyCharsAccess> 1829 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives( 1830 bool isMultiline, bool shouldWarnDeprecated) { 1831 // Match directive comments used in debugging, such as "//# sourceURL" and 1832 // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated. 1833 // 1834 // To avoid a crashing bug in IE, several JavaScript transpilers wrap single 1835 // line comments containing a source mapping URL inside a multiline 1836 // comment. To avoid potentially expensive lookahead and backtracking, we 1837 // only check for this case if we encounter a '#' code unit. 1838 1839 bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) && 1840 getSourceMappingURL(isMultiline, shouldWarnDeprecated); 1841 if (!res) { 1842 badToken(); 1843 } 1844 1845 return res; 1846 } 1847 1848 [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo( 1849 UniquePtr<char16_t[], JS::FreePolicy>* destination) { 1850 size_t length = charBuffer.length(); 1851 1852 *destination = fc->getAllocator()->make_pod_array<char16_t>(length + 1); 1853 if (!*destination) { 1854 return false; 1855 } 1856 1857 std::copy(charBuffer.begin(), charBuffer.end(), destination->get()); 1858 (*destination)[length] = '\0'; 1859 return true; 1860 } 1861 1862 template <typename Unit, class AnyCharsAccess> 1863 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective( 1864 bool isMultiline, bool shouldWarnDeprecated, const char* directive, 1865 uint8_t directiveLength, const char* errorMsgPragma, 1866 UniquePtr<char16_t[], JS::FreePolicy>* destination) { 1867 // Stop if we don't find |directive|. (Note that |directive| must be 1868 // ASCII, so there are no tricky encoding issues to consider in matching 1869 // UTF-8/16-agnostically.) 1870 if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) { 1871 return true; 1872 } 1873 1874 if (shouldWarnDeprecated) { 1875 if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) { 1876 return false; 1877 } 1878 } 1879 1880 this->charBuffer.clear(); 1881 1882 do { 1883 int32_t unit = peekCodeUnit(); 1884 if (unit == EOF) { 1885 break; 1886 } 1887 1888 if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 1889 if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) { 1890 break; 1891 } 1892 1893 consumeKnownCodeUnit(unit); 1894 1895 // Debugging directives can occur in both single- and multi-line 1896 // comments. If we're currently inside a multi-line comment, we 1897 // also must recognize multi-line comment terminators. 1898 if (isMultiline && unit == '*' && peekCodeUnit() == '/') { 1899 ungetCodeUnit('*'); 1900 break; 1901 } 1902 1903 if (!this->charBuffer.append(unit)) { 1904 return false; 1905 } 1906 1907 continue; 1908 } 1909 1910 // This ignores encoding errors: subsequent caller-side code to 1911 // handle the remaining source text in the comment will do so. 1912 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); 1913 if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) { 1914 break; 1915 } 1916 1917 MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()), 1918 "!IsSpace must imply !IsLineTerminator or else we'll fail to " 1919 "maintain line-info/flags for EOL"); 1920 this->sourceUnits.consumeKnownCodePoint(peeked); 1921 1922 if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) { 1923 return false; 1924 } 1925 } while (true); 1926 1927 if (this->charBuffer.empty()) { 1928 // The directive's URL was missing, but comments can contain anything, 1929 // so it isn't an error. 1930 return true; 1931 } 1932 1933 return copyCharBufferTo(destination); 1934 } 1935 1936 template <typename Unit, class AnyCharsAccess> 1937 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL( 1938 bool isMultiline, bool shouldWarnDeprecated) { 1939 // Match comments of the form "//# sourceURL=<url>" or 1940 // "/\* //# sourceURL=<url> *\/" 1941 // 1942 // Note that while these are labeled "sourceURL" in the source text, 1943 // internally we refer to it as a "displayURL" to distinguish what the 1944 // developer would like to refer to the source as from the source's actual 1945 // URL. 1946 1947 static constexpr char sourceURLDirective[] = " sourceURL="; 1948 constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective); 1949 return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective, 1950 sourceURLDirectiveLength, "sourceURL", 1951 &anyCharsAccess().displayURL_); 1952 } 1953 1954 template <typename Unit, class AnyCharsAccess> 1955 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL( 1956 bool isMultiline, bool shouldWarnDeprecated) { 1957 // Match comments of the form "//# sourceMappingURL=<url>" or 1958 // "/\* //# sourceMappingURL=<url> *\/" 1959 1960 static constexpr char sourceMappingURLDirective[] = " sourceMappingURL="; 1961 constexpr uint8_t sourceMappingURLDirectiveLength = 1962 js_strlen(sourceMappingURLDirective); 1963 return getDirective(isMultiline, shouldWarnDeprecated, 1964 sourceMappingURLDirective, 1965 sourceMappingURLDirectiveLength, "sourceMappingURL", 1966 &anyCharsAccess().sourceMapURL_); 1967 } 1968 1969 template <typename Unit, class AnyCharsAccess> 1970 MOZ_ALWAYS_INLINE Token* 1971 GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal( 1972 TokenKind kind, TokenStart start, TokenKind* out) { 1973 MOZ_ASSERT(kind < TokenKind::Limit); 1974 MOZ_ASSERT(kind != TokenKind::Eol, 1975 "TokenKind::Eol should never be used in an actual Token, only " 1976 "returned by peekTokenSameLine()"); 1977 1978 TokenStreamAnyChars& anyChars = anyCharsAccess(); 1979 anyChars.flags.isDirtyLine = true; 1980 1981 Token* token = anyChars.allocateToken(); 1982 1983 *out = token->type = kind; 1984 token->pos = TokenPos(start.offset(), this->sourceUnits.offset()); 1985 MOZ_ASSERT(token->pos.begin <= token->pos.end); 1986 1987 // NOTE: |token->modifier| is set in |newToken()| so that optimized, 1988 // non-debug code won't do any work to pass a modifier-argument that will 1989 // never be used. 1990 1991 return token; 1992 } 1993 1994 template <typename Unit, class AnyCharsAccess> 1995 MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() { 1996 // We didn't get a token, so don't set |flags.isDirtyLine|. 1997 anyCharsAccess().flags.hadError = true; 1998 1999 // Poisoning sourceUnits on error establishes an invariant: once an 2000 // erroneous token has been seen, sourceUnits will not be consulted again. 2001 // This is true because the parser will deal with the illegal token by 2002 // aborting parsing immediately. 2003 this->sourceUnits.poisonInDebug(); 2004 2005 return false; 2006 }; 2007 2008 bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, char32_t codePoint) { 2009 MOZ_ASSERT(codePoint <= unicode::NonBMPMax, 2010 "should only be processing code points validly decoded from UTF-8 " 2011 "or WTF-16 source text (surrogate code points permitted)"); 2012 2013 char16_t units[2]; 2014 unsigned numUnits = 0; 2015 unicode::UTF16Encode(codePoint, units, &numUnits); 2016 2017 MOZ_ASSERT(numUnits == 1 || numUnits == 2, 2018 "UTF-16 code points are only encoded in one or two units"); 2019 2020 if (!charBuffer.append(units[0])) { 2021 return false; 2022 } 2023 2024 if (numUnits == 1) { 2025 return true; 2026 } 2027 2028 return charBuffer.append(units[1]); 2029 } 2030 2031 template <typename Unit, class AnyCharsAccess> 2032 bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer( 2033 const Unit* identStart) { 2034 const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit(); 2035 this->sourceUnits.setAddressOfNextCodeUnit(identStart); 2036 2037 auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() { 2038 this->sourceUnits.setAddressOfNextCodeUnit(originalAddress); 2039 }); 2040 2041 this->charBuffer.clear(); 2042 do { 2043 int32_t unit = getCodeUnit(); 2044 if (unit == EOF) { 2045 break; 2046 } 2047 2048 char32_t codePoint; 2049 if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 2050 if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') { 2051 if (!this->charBuffer.append(unit)) { 2052 return false; 2053 } 2054 2055 continue; 2056 } 2057 2058 if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) { 2059 break; 2060 } 2061 } else { 2062 // |restoreNextRawCharAddress| undoes all gets, and this function 2063 // doesn't update line/column info. 2064 char32_t cp; 2065 if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) { 2066 return false; 2067 } 2068 2069 codePoint = cp; 2070 if (!unicode::IsIdentifierPart(codePoint)) { 2071 break; 2072 } 2073 } 2074 2075 if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) { 2076 return false; 2077 } 2078 } while (true); 2079 2080 return true; 2081 } 2082 2083 template <typename Unit, class AnyCharsAccess> 2084 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName( 2085 TokenStart start, const Unit* identStart, IdentifierEscapes escaping, 2086 Modifier modifier, NameVisibility visibility, TokenKind* out) { 2087 // Run the bad-token code for every path out of this function except the 2088 // two success-cases. 2089 auto noteBadToken = MakeScopeExit([this]() { this->badToken(); }); 2090 2091 // We've already consumed an initial code point in the identifer, to *know* 2092 // that this is an identifier. So no need to worry about not consuming any 2093 // code points in the loop below. 2094 int32_t unit; 2095 while (true) { 2096 unit = peekCodeUnit(); 2097 if (unit == EOF) { 2098 break; 2099 } 2100 2101 if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 2102 consumeKnownCodeUnit(unit); 2103 2104 if (MOZ_UNLIKELY( 2105 !unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) { 2106 // Handle a Unicode escape -- otherwise it's not part of the 2107 // identifier. 2108 char32_t codePoint; 2109 if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) { 2110 ungetCodeUnit(unit); 2111 break; 2112 } 2113 2114 escaping = IdentifierEscapes::SawUnicodeEscape; 2115 } 2116 } else { 2117 // This ignores encoding errors: subsequent caller-side code to 2118 // handle source text after the IdentifierName will do so. 2119 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); 2120 if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) { 2121 break; 2122 } 2123 2124 MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()), 2125 "IdentifierPart must guarantee !IsLineTerminator or " 2126 "else we'll fail to maintain line-info/flags for EOL"); 2127 2128 this->sourceUnits.consumeKnownCodePoint(peeked); 2129 } 2130 } 2131 2132 TaggedParserAtomIndex atom; 2133 if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) { 2134 // Identifiers containing Unicode escapes have to be converted into 2135 // tokenbuf before atomizing. 2136 if (!putIdentInCharBuffer(identStart)) { 2137 return false; 2138 } 2139 2140 atom = drainCharBufferIntoAtom(); 2141 } else { 2142 // Escape-free identifiers can be created directly from sourceUnits. 2143 const Unit* chars = identStart; 2144 size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart; 2145 2146 // Private identifiers start with a '#', and so cannot be reserved words. 2147 if (visibility == NameVisibility::Public) { 2148 // Represent reserved words lacking escapes as reserved word tokens. 2149 if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) { 2150 noteBadToken.release(); 2151 newSimpleToken(rw->tokentype, start, modifier, out); 2152 return true; 2153 } 2154 } 2155 2156 atom = atomizeSourceChars(Span(chars, length)); 2157 } 2158 if (!atom) { 2159 return false; 2160 } 2161 2162 noteBadToken.release(); 2163 if (visibility == NameVisibility::Private) { 2164 newPrivateNameToken(atom, start, modifier, out); 2165 return true; 2166 } 2167 newNameToken(atom, start, modifier, out); 2168 return true; 2169 } 2170 2171 enum FirstCharKind { 2172 // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid 2173 // token that cannot also be a prefix of a longer token. E.g. ';' has the 2174 // OneChar kind, but '+' does not, because '++' and '+=' are valid longer 2175 // tokens 2176 // that begin with '+'. 2177 // 2178 // The few token kinds satisfying these properties cover roughly 35--45% 2179 // of the tokens seen in practice. 2180 // 2181 // We represent the 'OneChar' kind with any positive value less than 2182 // TokenKind::Limit. This representation lets us associate 2183 // each one-char token char16_t with a TokenKind and thus avoid 2184 // a subsequent char16_t-to-TokenKind conversion. 2185 OneChar_Min = 0, 2186 OneChar_Max = size_t(TokenKind::Limit) - 1, 2187 2188 Space = size_t(TokenKind::Limit), 2189 Ident, 2190 Dec, 2191 String, 2192 EOL, 2193 ZeroDigit, 2194 Other, 2195 2196 LastCharKind = Other 2197 }; 2198 2199 // OneChar: 40, 41, 44, 58, 59, 91, 93, 123, 125, 126: 2200 // '(', ')', ',', ':', ';', '[', ']', '{', '}', '~' 2201 // Ident: 36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z' 2202 // Dot: 46: '.' 2203 // Equals: 61: '=' 2204 // String: 34, 39, 96: '"', '\'', '`' 2205 // Dec: 49..57: '1'..'9' 2206 // Plus: 43: '+' 2207 // ZeroDigit: 48: '0' 2208 // Space: 9, 11, 12, 32: '\t', '\v', '\f', ' ' 2209 // EOL: 10, 13: '\n', '\r' 2210 // 2211 #define T_COMMA size_t(TokenKind::Comma) 2212 #define T_COLON size_t(TokenKind::Colon) 2213 #define T_BITNOT size_t(TokenKind::BitNot) 2214 #define T_LP size_t(TokenKind::LeftParen) 2215 #define T_RP size_t(TokenKind::RightParen) 2216 #define T_SEMI size_t(TokenKind::Semi) 2217 #define T_LB size_t(TokenKind::LeftBracket) 2218 #define T_RB size_t(TokenKind::RightBracket) 2219 #define T_LC size_t(TokenKind::LeftCurly) 2220 #define T_RC size_t(TokenKind::RightCurly) 2221 #define _______ Other 2222 static const uint8_t firstCharKinds[] = { 2223 // clang-format off 2224 /* 0 1 2 3 4 5 6 7 8 9 */ 2225 /* 0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, Space, 2226 /* 10+ */ EOL, Space, Space, EOL, _______, _______, _______, _______, _______, _______, 2227 /* 20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______, 2228 /* 30+ */ _______, _______, Space, _______, String, _______, Ident, _______, _______, String, 2229 /* 40+ */ T_LP, T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit, Dec, 2230 /* 50+ */ Dec, Dec, Dec, Dec, Dec, Dec, Dec, Dec, T_COLON, T_SEMI, 2231 /* 60+ */ _______, _______, _______, _______, _______, Ident, Ident, Ident, Ident, Ident, 2232 /* 70+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, 2233 /* 80+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, 2234 /* 90+ */ Ident, T_LB, _______, T_RB, _______, Ident, String, Ident, Ident, Ident, 2235 /* 100+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, 2236 /* 110+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, 2237 /* 120+ */ Ident, Ident, Ident, T_LC, _______, T_RC,T_BITNOT, _______ 2238 // clang-format on 2239 }; 2240 #undef T_COMMA 2241 #undef T_COLON 2242 #undef T_BITNOT 2243 #undef T_LP 2244 #undef T_RP 2245 #undef T_SEMI 2246 #undef T_LB 2247 #undef T_RB 2248 #undef T_LC 2249 #undef T_RC 2250 #undef _______ 2251 2252 static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)), 2253 "Elements of firstCharKinds[] are too small"); 2254 2255 template <> 2256 void SourceUnits<char16_t>::consumeRestOfSingleLineComment() { 2257 while (MOZ_LIKELY(!atEnd())) { 2258 char16_t unit = peekCodeUnit(); 2259 if (IsLineTerminator(unit)) { 2260 return; 2261 } 2262 2263 consumeKnownCodeUnit(unit); 2264 } 2265 } 2266 2267 template <> 2268 void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() { 2269 while (MOZ_LIKELY(!atEnd())) { 2270 const Utf8Unit unit = peekCodeUnit(); 2271 if (IsSingleUnitLineTerminator(unit)) { 2272 return; 2273 } 2274 2275 if (MOZ_LIKELY(IsAscii(unit))) { 2276 consumeKnownCodeUnit(unit); 2277 continue; 2278 } 2279 2280 PeekedCodePoint<Utf8Unit> peeked = peekCodePoint(); 2281 if (peeked.isNone()) { 2282 return; 2283 } 2284 2285 char32_t c = peeked.codePoint(); 2286 if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR || 2287 c == unicode::PARA_SEPARATOR)) { 2288 return; 2289 } 2290 2291 consumeKnownCodePoint(peeked); 2292 } 2293 } 2294 2295 template <typename Unit, class AnyCharsAccess> 2296 [[nodiscard]] MOZ_ALWAYS_INLINE bool 2297 TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger( 2298 IsIntegerUnit isIntegerUnit, int32_t* nextUnit) { 2299 int32_t unit = getCodeUnit(); 2300 if (!isIntegerUnit(unit)) { 2301 *nextUnit = unit; 2302 return true; 2303 } 2304 return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit); 2305 } 2306 2307 template <typename Unit, class AnyCharsAccess> 2308 [[nodiscard]] MOZ_ALWAYS_INLINE bool 2309 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit( 2310 IsIntegerUnit isIntegerUnit, int32_t* nextUnit) { 2311 int32_t unit; 2312 while (true) { 2313 unit = getCodeUnit(); 2314 if (isIntegerUnit(unit)) { 2315 continue; 2316 } 2317 if (unit != '_') { 2318 break; 2319 } 2320 unit = getCodeUnit(); 2321 if (!isIntegerUnit(unit)) { 2322 if (unit == '_') { 2323 ungetCodeUnit(unit); 2324 error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES); 2325 } else { 2326 ungetCodeUnit(unit); 2327 ungetCodeUnit('_'); 2328 error(JSMSG_NUMBER_END_WITH_UNDERSCORE); 2329 } 2330 return false; 2331 } 2332 } 2333 2334 *nextUnit = unit; 2335 return true; 2336 } 2337 2338 template <typename Unit, class AnyCharsAccess> 2339 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber( 2340 int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier, 2341 TokenKind* out) { 2342 // Run the bad-token code for every path out of this function except the 2343 // one success-case. 2344 auto noteBadToken = MakeScopeExit([this]() { this->badToken(); }); 2345 2346 // Consume integral component digits. 2347 if (IsAsciiDigit(unit)) { 2348 if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) { 2349 return false; 2350 } 2351 } 2352 2353 // Numbers contain no escapes, so we can read directly from |sourceUnits|. 2354 double dval; 2355 bool isBigInt = false; 2356 DecimalPoint decimalPoint = NoDecimal; 2357 if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') { 2358 // NOTE: |unit| may be EOF here. 2359 ungetCodeUnit(unit); 2360 2361 // Most numbers are pure decimal integers without fractional component 2362 // or exponential notation. Handle that with optimized code. 2363 if (!GetDecimalInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(), 2364 &dval)) { 2365 ReportOutOfMemory(this->fc); 2366 return false; 2367 } 2368 } else if (unit == 'n') { 2369 isBigInt = true; 2370 unit = peekCodeUnit(); 2371 } else { 2372 // Consume any decimal dot and fractional component. 2373 if (unit == '.') { 2374 decimalPoint = HasDecimal; 2375 if (!matchInteger(IsAsciiDigit, &unit)) { 2376 return false; 2377 } 2378 } 2379 2380 // Consume any exponential notation. 2381 if (unit == 'e' || unit == 'E') { 2382 unit = getCodeUnit(); 2383 if (unit == '+' || unit == '-') { 2384 unit = getCodeUnit(); 2385 } 2386 2387 // Exponential notation must contain at least one digit. 2388 if (!IsAsciiDigit(unit)) { 2389 ungetCodeUnit(unit); 2390 error(JSMSG_MISSING_EXPONENT); 2391 return false; 2392 } 2393 2394 // Consume exponential digits. 2395 if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) { 2396 return false; 2397 } 2398 } 2399 2400 ungetCodeUnit(unit); 2401 2402 if (!GetDecimal(numStart, this->sourceUnits.addressOfNextCodeUnit(), 2403 &dval)) { 2404 ReportOutOfMemory(this->fc); 2405 return false; 2406 } 2407 } 2408 2409 // Number followed by IdentifierStart is an error. (This is the only place 2410 // in ECMAScript where token boundary is inadequate to properly separate 2411 // two tokens, necessitating this unaesthetic lookahead.) 2412 if (unit != EOF) { 2413 if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 2414 if (unicode::IsIdentifierStart(char16_t(unit))) { 2415 error(JSMSG_IDSTART_AFTER_NUMBER); 2416 return false; 2417 } 2418 } else { 2419 // This ignores encoding errors: subsequent caller-side code to 2420 // handle source text after the number will do so. 2421 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); 2422 if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) { 2423 error(JSMSG_IDSTART_AFTER_NUMBER); 2424 return false; 2425 } 2426 } 2427 } 2428 2429 noteBadToken.release(); 2430 2431 if (isBigInt) { 2432 return bigIntLiteral(start, modifier, out); 2433 } 2434 2435 newNumberToken(dval, decimalPoint, start, modifier, out); 2436 return true; 2437 } 2438 2439 template <typename Unit, class AnyCharsAccess> 2440 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral( 2441 TokenStart start, TokenKind* out) { 2442 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/')); 2443 this->charBuffer.clear(); 2444 2445 auto ProcessNonAsciiCodePoint = [this](int32_t lead) { 2446 MOZ_ASSERT(lead != EOF); 2447 MOZ_ASSERT(!this->isAsciiCodePoint(lead)); 2448 2449 char32_t codePoint; 2450 if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead), 2451 &codePoint)) { 2452 return false; 2453 } 2454 2455 if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR || 2456 codePoint == unicode::PARA_SEPARATOR)) { 2457 this->sourceUnits.ungetLineOrParagraphSeparator(); 2458 this->error(JSMSG_UNTERMINATED_REGEXP); 2459 return false; 2460 } 2461 2462 return AppendCodePointToCharBuffer(this->charBuffer, codePoint); 2463 }; 2464 2465 auto ReportUnterminatedRegExp = [this](int32_t unit) { 2466 this->ungetCodeUnit(unit); 2467 this->error(JSMSG_UNTERMINATED_REGEXP); 2468 }; 2469 2470 bool inCharClass = false; 2471 do { 2472 int32_t unit = getCodeUnit(); 2473 if (unit == EOF) { 2474 ReportUnterminatedRegExp(unit); 2475 return badToken(); 2476 } 2477 2478 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { 2479 if (!ProcessNonAsciiCodePoint(unit)) { 2480 return badToken(); 2481 } 2482 2483 continue; 2484 } 2485 2486 if (unit == '\\') { 2487 if (!this->charBuffer.append(unit)) { 2488 return badToken(); 2489 } 2490 2491 unit = getCodeUnit(); 2492 if (unit == EOF) { 2493 ReportUnterminatedRegExp(unit); 2494 return badToken(); 2495 } 2496 2497 // Fallthrough only handles ASCII code points, so 2498 // deal with non-ASCII and skip everything else. 2499 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { 2500 if (!ProcessNonAsciiCodePoint(unit)) { 2501 return badToken(); 2502 } 2503 2504 continue; 2505 } 2506 } else if (unit == '[') { 2507 inCharClass = true; 2508 } else if (unit == ']') { 2509 inCharClass = false; 2510 } else if (unit == '/' && !inCharClass) { 2511 // For IE compat, allow unescaped / in char classes. 2512 break; 2513 } 2514 2515 // NOTE: Non-ASCII LineTerminators were handled by 2516 // ProcessNonAsciiCodePoint calls above. 2517 if (unit == '\r' || unit == '\n') { 2518 ReportUnterminatedRegExp(unit); 2519 return badToken(); 2520 } 2521 2522 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit))); 2523 if (!this->charBuffer.append(unit)) { 2524 return badToken(); 2525 } 2526 } while (true); 2527 2528 int32_t unit; 2529 RegExpFlags reflags = RegExpFlag::NoFlags; 2530 while (true) { 2531 uint8_t flag; 2532 unit = getCodeUnit(); 2533 if (unit == 'd') { 2534 flag = RegExpFlag::HasIndices; 2535 } else if (unit == 'g') { 2536 flag = RegExpFlag::Global; 2537 } else if (unit == 'i') { 2538 flag = RegExpFlag::IgnoreCase; 2539 } else if (unit == 'm') { 2540 flag = RegExpFlag::Multiline; 2541 } else if (unit == 's') { 2542 flag = RegExpFlag::DotAll; 2543 } else if (unit == 'u') { 2544 flag = RegExpFlag::Unicode; 2545 } else if (unit == 'v') { 2546 flag = RegExpFlag::UnicodeSets; 2547 } else if (unit == 'y') { 2548 flag = RegExpFlag::Sticky; 2549 } else if (IsAsciiAlpha(unit)) { 2550 flag = RegExpFlag::NoFlags; 2551 } else { 2552 break; 2553 } 2554 2555 if ((reflags & flag) || flag == RegExpFlag::NoFlags) { 2556 ungetCodeUnit(unit); 2557 char buf[2] = {char(unit), '\0'}; 2558 error(JSMSG_BAD_REGEXP_FLAG, buf); 2559 return badToken(); 2560 } 2561 2562 // /u and /v flags are mutually exclusive. 2563 if (((reflags & RegExpFlag::Unicode) && (flag & RegExpFlag::UnicodeSets)) || 2564 ((reflags & RegExpFlag::UnicodeSets) && (flag & RegExpFlag::Unicode))) { 2565 ungetCodeUnit(unit); 2566 char buf[2] = {char(unit), '\0'}; 2567 error(JSMSG_BAD_REGEXP_FLAG, buf); 2568 return badToken(); 2569 } 2570 2571 reflags |= flag; 2572 } 2573 ungetCodeUnit(unit); 2574 2575 newRegExpToken(reflags, start, out); 2576 return true; 2577 } 2578 2579 template <typename Unit, class AnyCharsAccess> 2580 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral( 2581 TokenStart start, Modifier modifier, TokenKind* out) { 2582 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n')); 2583 MOZ_ASSERT(this->sourceUnits.offset() > start.offset()); 2584 uint32_t length = this->sourceUnits.offset() - start.offset(); 2585 MOZ_ASSERT(length >= 2); 2586 this->charBuffer.clear(); 2587 mozilla::Range<const Unit> chars( 2588 this->sourceUnits.codeUnitPtrAt(start.offset()), length); 2589 for (uint32_t idx = 0; idx < length - 1; idx++) { 2590 int32_t unit = CodeUnitValue(chars[idx]); 2591 // Char buffer may start with a 0[bBoOxX] prefix, then follows with 2592 // binary, octal, decimal, or hex digits. Already checked by caller, as 2593 // the "n" indicating bigint comes at the end. 2594 MOZ_ASSERT(isAsciiCodePoint(unit)); 2595 // Skip over any separators. 2596 if (unit == '_') { 2597 continue; 2598 } 2599 if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) { 2600 return false; 2601 } 2602 } 2603 newBigIntToken(start, modifier, out); 2604 return true; 2605 } 2606 2607 template <typename Unit, class AnyCharsAccess> 2608 void GeneralTokenStreamChars<Unit, 2609 AnyCharsAccess>::consumeOptionalHashbangComment() { 2610 MOZ_ASSERT(this->sourceUnits.atStart(), 2611 "HashBangComment can only appear immediately at the start of a " 2612 "Script or Module"); 2613 2614 // HashbangComment :: 2615 // #! SingleLineCommentChars_opt 2616 2617 if (!matchCodeUnit('#')) { 2618 // HashbangComment is optional at start of Script or Module. 2619 return; 2620 } 2621 2622 if (!matchCodeUnit('!')) { 2623 // # not followed by ! at start of Script or Module is an error, but normal 2624 // parsing code will handle that error just fine if we let it. 2625 ungetCodeUnit('#'); 2626 return; 2627 } 2628 2629 // This doesn't consume a concluding LineTerminator, and it stops consuming 2630 // just before any encoding error. The subsequent |getToken| call will call 2631 // |getTokenInternal| below which will handle these possibilities. 2632 this->sourceUnits.consumeRestOfSingleLineComment(); 2633 } 2634 2635 template <typename Unit, class AnyCharsAccess> 2636 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal( 2637 TokenKind* const ttp, const Modifier modifier) { 2638 // Assume we'll fail: success cases will overwrite this. 2639 #ifdef DEBUG 2640 *ttp = TokenKind::Limit; 2641 #endif 2642 MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp)); 2643 2644 // This loop runs more than once only when whitespace or comments are 2645 // encountered. 2646 do { 2647 int32_t unit = peekCodeUnit(); 2648 if (MOZ_UNLIKELY(unit == EOF)) { 2649 MOZ_ASSERT(this->sourceUnits.atEnd()); 2650 anyCharsAccess().flags.isEOF = true; 2651 TokenStart start(this->sourceUnits, 0); 2652 newSimpleToken(TokenKind::Eof, start, modifier, ttp); 2653 return true; 2654 } 2655 2656 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { 2657 // Non-ASCII code points can only be identifiers or whitespace. It would 2658 // be nice to compute these *after* discarding whitespace, but IN A WORLD 2659 // where |unicode::IsSpace| requires consuming a variable number of code 2660 // units, it's easier to assume it's an identifier and maybe do a little 2661 // wasted work, than to unget and compute and reget if whitespace. 2662 TokenStart start(this->sourceUnits, 0); 2663 const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit(); 2664 2665 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); 2666 if (peeked.isNone()) { 2667 MOZ_ALWAYS_FALSE(getCodePoint()); 2668 return badToken(); 2669 } 2670 2671 char32_t cp = peeked.codePoint(); 2672 if (unicode::IsSpace(cp)) { 2673 this->sourceUnits.consumeKnownCodePoint(peeked); 2674 if (IsLineTerminator(cp)) { 2675 if (!updateLineInfoForEOL()) { 2676 return badToken(); 2677 } 2678 2679 anyCharsAccess().updateFlagsForEOL(); 2680 } 2681 2682 continue; 2683 } 2684 2685 static_assert(isAsciiCodePoint('$'), 2686 "IdentifierStart contains '$', but as " 2687 "!IsUnicodeIDStart('$'), ensure that '$' is never " 2688 "handled here"); 2689 static_assert(isAsciiCodePoint('_'), 2690 "IdentifierStart contains '_', but as " 2691 "!IsUnicodeIDStart('_'), ensure that '_' is never " 2692 "handled here"); 2693 2694 if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) { 2695 this->sourceUnits.consumeKnownCodePoint(peeked); 2696 MOZ_ASSERT(!IsLineTerminator(cp), 2697 "IdentifierStart must guarantee !IsLineTerminator " 2698 "or else we'll fail to maintain line-info/flags " 2699 "for EOL here"); 2700 2701 return identifierName(start, identStart, IdentifierEscapes::None, 2702 modifier, NameVisibility::Public, ttp); 2703 } 2704 2705 reportIllegalCharacter(cp); 2706 return badToken(); 2707 } // !isAsciiCodePoint(unit) 2708 2709 consumeKnownCodeUnit(unit); 2710 2711 // Get the token kind, based on the first char. The ordering of c1kind 2712 // comparison is based on the frequency of tokens in real code: 2713 // Parsemark (which represents typical JS code on the web) and the 2714 // Unreal demo (which represents asm.js code). 2715 // 2716 // Parsemark Unreal 2717 // OneChar 32.9% 39.7% 2718 // Space 25.0% 0.6% 2719 // Ident 19.2% 36.4% 2720 // Dec 7.2% 5.1% 2721 // String 7.9% 0.0% 2722 // EOL 1.7% 0.0% 2723 // ZeroDigit 0.4% 4.9% 2724 // Other 5.7% 13.3% 2725 // 2726 // The ordering is based mostly only Parsemark frequencies, with Unreal 2727 // frequencies used to break close categories (e.g. |Dec| and 2728 // |String|). |Other| is biggish, but no other token kind is common 2729 // enough for it to be worth adding extra values to FirstCharKind. 2730 FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]); 2731 2732 // Look for an unambiguous single-char token. 2733 // 2734 if (c1kind <= OneChar_Max) { 2735 TokenStart start(this->sourceUnits, -1); 2736 newSimpleToken(TokenKind(c1kind), start, modifier, ttp); 2737 return true; 2738 } 2739 2740 // Skip over non-EOL whitespace chars. 2741 // 2742 if (c1kind == Space) { 2743 continue; 2744 } 2745 2746 // Look for an identifier. 2747 // 2748 if (c1kind == Ident) { 2749 TokenStart start(this->sourceUnits, -1); 2750 return identifierName( 2751 start, this->sourceUnits.addressOfNextCodeUnit() - 1, 2752 IdentifierEscapes::None, modifier, NameVisibility::Public, ttp); 2753 } 2754 2755 // Look for a decimal number. 2756 // 2757 if (c1kind == Dec) { 2758 TokenStart start(this->sourceUnits, -1); 2759 const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; 2760 return decimalNumber(unit, start, numStart, modifier, ttp); 2761 } 2762 2763 // Look for a string or a template string. 2764 // 2765 if (c1kind == String) { 2766 return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp); 2767 } 2768 2769 // Skip over EOL chars, updating line state along the way. 2770 // 2771 if (c1kind == EOL) { 2772 if (unit == '\r') { 2773 matchLineTerminator('\n'); 2774 } 2775 2776 if (!updateLineInfoForEOL()) { 2777 return badToken(); 2778 } 2779 2780 anyCharsAccess().updateFlagsForEOL(); 2781 continue; 2782 } 2783 2784 // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a 2785 // number starting with '0' that contains '8' or '9' and is treated as 2786 // decimal) number. 2787 // 2788 if (c1kind == ZeroDigit) { 2789 TokenStart start(this->sourceUnits, -1); 2790 int radix; 2791 bool isBigInt = false; 2792 const Unit* numStart; 2793 unit = getCodeUnit(); 2794 if (unit == 'x' || unit == 'X') { 2795 radix = 16; 2796 unit = getCodeUnit(); 2797 if (!IsAsciiHexDigit(unit)) { 2798 // NOTE: |unit| may be EOF here. 2799 ungetCodeUnit(unit); 2800 error(JSMSG_MISSING_HEXDIGITS); 2801 return badToken(); 2802 } 2803 2804 // one past the '0x' 2805 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; 2806 2807 if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) { 2808 return badToken(); 2809 } 2810 } else if (unit == 'b' || unit == 'B') { 2811 radix = 2; 2812 unit = getCodeUnit(); 2813 if (!IsAsciiBinary(unit)) { 2814 // NOTE: |unit| may be EOF here. 2815 ungetCodeUnit(unit); 2816 error(JSMSG_MISSING_BINARY_DIGITS); 2817 return badToken(); 2818 } 2819 2820 // one past the '0b' 2821 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; 2822 2823 if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) { 2824 return badToken(); 2825 } 2826 } else if (unit == 'o' || unit == 'O') { 2827 radix = 8; 2828 unit = getCodeUnit(); 2829 if (!IsAsciiOctal(unit)) { 2830 // NOTE: |unit| may be EOF here. 2831 ungetCodeUnit(unit); 2832 error(JSMSG_MISSING_OCTAL_DIGITS); 2833 return badToken(); 2834 } 2835 2836 // one past the '0o' 2837 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; 2838 2839 if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) { 2840 return badToken(); 2841 } 2842 } else if (IsAsciiDigit(unit)) { 2843 // Reject octal literals that appear in strict mode code. 2844 if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) { 2845 return badToken(); 2846 } 2847 2848 // The above test doesn't catch a few edge cases; see 2849 // |GeneralParser::maybeParseDirective|. Record the violation so that 2850 // that function can handle them. 2851 anyCharsAccess().setSawDeprecatedOctalLiteral(); 2852 2853 radix = 8; 2854 // one past the '0' 2855 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; 2856 2857 bool nonOctalDecimalIntegerLiteral = false; 2858 do { 2859 if (unit >= '8') { 2860 nonOctalDecimalIntegerLiteral = true; 2861 } 2862 unit = getCodeUnit(); 2863 } while (IsAsciiDigit(unit)); 2864 2865 if (unit == '_') { 2866 ungetCodeUnit(unit); 2867 error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER); 2868 return badToken(); 2869 } 2870 2871 if (unit == 'n') { 2872 ungetCodeUnit(unit); 2873 error(JSMSG_BIGINT_INVALID_SYNTAX); 2874 return badToken(); 2875 } 2876 2877 if (nonOctalDecimalIntegerLiteral) { 2878 // Use the decimal scanner for the rest of the number. 2879 return decimalNumber(unit, start, numStart, modifier, ttp); 2880 } 2881 } else if (unit == '_') { 2882 // Give a more explicit error message when '_' is used after '0'. 2883 ungetCodeUnit(unit); 2884 error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER); 2885 return badToken(); 2886 } else { 2887 // '0' not followed by [XxBbOo0-9_]; scan as a decimal number. 2888 ungetCodeUnit(unit); 2889 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1; // The '0'. 2890 return decimalNumber('0', start, numStart, modifier, ttp); 2891 } 2892 2893 if (unit == 'n') { 2894 isBigInt = true; 2895 unit = peekCodeUnit(); 2896 } else { 2897 ungetCodeUnit(unit); 2898 } 2899 2900 // Error if an identifier-start code point appears immediately 2901 // after the number. Somewhat surprisingly, if we don't check 2902 // here, we'll never check at all. 2903 if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 2904 if (unicode::IsIdentifierStart(char16_t(unit))) { 2905 error(JSMSG_IDSTART_AFTER_NUMBER); 2906 return badToken(); 2907 } 2908 } else if (MOZ_LIKELY(unit != EOF)) { 2909 // This ignores encoding errors: subsequent caller-side code to 2910 // handle source text after the number will do so. 2911 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); 2912 if (!peeked.isNone() && 2913 unicode::IsIdentifierStart(peeked.codePoint())) { 2914 error(JSMSG_IDSTART_AFTER_NUMBER); 2915 return badToken(); 2916 } 2917 } 2918 2919 if (isBigInt) { 2920 return bigIntLiteral(start, modifier, ttp); 2921 } 2922 2923 double dval; 2924 if (!GetFullInteger(numStart, this->sourceUnits.addressOfNextCodeUnit(), 2925 radix, IntegerSeparatorHandling::SkipUnderscore, 2926 &dval)) { 2927 ReportOutOfMemory(this->fc); 2928 return badToken(); 2929 } 2930 newNumberToken(dval, NoDecimal, start, modifier, ttp); 2931 return true; 2932 } 2933 2934 MOZ_ASSERT(c1kind == Other); 2935 2936 // This handles everything else. Simple tokens distinguished solely by 2937 // TokenKind should set |simpleKind| and break, to share simple-token 2938 // creation code for all such tokens. All other tokens must be handled 2939 // by returning (or by continuing from the loop enclosing this). 2940 // 2941 TokenStart start(this->sourceUnits, -1); 2942 TokenKind simpleKind; 2943 #ifdef DEBUG 2944 simpleKind = TokenKind::Limit; // sentinel value for code after switch 2945 #endif 2946 2947 // The block a ways above eliminated all non-ASCII, so cast to the 2948 // smallest type possible to assist the C++ compiler. 2949 switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) { 2950 case '.': 2951 if (IsAsciiDigit(peekCodeUnit())) { 2952 return decimalNumber('.', start, 2953 this->sourceUnits.addressOfNextCodeUnit() - 1, 2954 modifier, ttp); 2955 } 2956 2957 unit = getCodeUnit(); 2958 if (unit == '.') { 2959 if (matchCodeUnit('.')) { 2960 simpleKind = TokenKind::TripleDot; 2961 break; 2962 } 2963 } 2964 2965 // NOTE: |unit| may be EOF here. A stray '.' at EOF would be an 2966 // error, but subsequent code will handle it. 2967 ungetCodeUnit(unit); 2968 2969 simpleKind = TokenKind::Dot; 2970 break; 2971 2972 case '#': { 2973 TokenStart start(this->sourceUnits, -1); 2974 const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1; 2975 IdentifierEscapes sawEscape; 2976 if (!matchIdentifierStart(&sawEscape)) { 2977 return badToken(); 2978 } 2979 return identifierName(start, identStart, sawEscape, modifier, 2980 NameVisibility::Private, ttp); 2981 } 2982 2983 case '=': 2984 if (matchCodeUnit('=')) { 2985 simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq; 2986 } else if (matchCodeUnit('>')) { 2987 simpleKind = TokenKind::Arrow; 2988 } else { 2989 simpleKind = TokenKind::Assign; 2990 } 2991 break; 2992 2993 case '+': 2994 if (matchCodeUnit('+')) { 2995 simpleKind = TokenKind::Inc; 2996 } else { 2997 simpleKind = 2998 matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add; 2999 } 3000 break; 3001 3002 case '\\': { 3003 char32_t codePoint; 3004 if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) { 3005 return identifierName( 3006 start, 3007 this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1, 3008 IdentifierEscapes::SawUnicodeEscape, modifier, 3009 NameVisibility::Public, ttp); 3010 } 3011 3012 // We could point "into" a mistyped escape, e.g. for "\u{41H}" we 3013 // could point at the 'H'. But we don't do that now, so the code 3014 // unit after the '\' isn't necessarily bad, so just point at the 3015 // start of the actually-invalid escape. 3016 ungetCodeUnit('\\'); 3017 error(JSMSG_BAD_ESCAPE); 3018 return badToken(); 3019 } 3020 3021 case '|': 3022 if (matchCodeUnit('|')) { 3023 simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or; 3024 } else { 3025 simpleKind = 3026 matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr; 3027 } 3028 break; 3029 3030 case '^': 3031 simpleKind = 3032 matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor; 3033 break; 3034 3035 case '&': 3036 if (matchCodeUnit('&')) { 3037 simpleKind = 3038 matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And; 3039 } else { 3040 simpleKind = 3041 matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd; 3042 } 3043 break; 3044 3045 case '?': 3046 if (matchCodeUnit('.')) { 3047 unit = getCodeUnit(); 3048 if (IsAsciiDigit(unit)) { 3049 // if the code unit is followed by a number, for example it has the 3050 // following form `<...> ?.5 <..> then it should be treated as a 3051 // ternary rather than as an optional chain 3052 simpleKind = TokenKind::Hook; 3053 ungetCodeUnit(unit); 3054 ungetCodeUnit('.'); 3055 } else { 3056 ungetCodeUnit(unit); 3057 simpleKind = TokenKind::OptionalChain; 3058 } 3059 } else if (matchCodeUnit('?')) { 3060 simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign 3061 : TokenKind::Coalesce; 3062 } else { 3063 simpleKind = TokenKind::Hook; 3064 } 3065 break; 3066 3067 case '!': 3068 if (matchCodeUnit('=')) { 3069 simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne; 3070 } else { 3071 simpleKind = TokenKind::Not; 3072 } 3073 break; 3074 3075 case '<': 3076 if (anyCharsAccess().options().allowHTMLComments) { 3077 // Treat HTML begin-comment as comment-till-end-of-line. 3078 if (matchCodeUnit('!')) { 3079 if (matchCodeUnit('-')) { 3080 if (matchCodeUnit('-')) { 3081 this->sourceUnits.consumeRestOfSingleLineComment(); 3082 continue; 3083 } 3084 ungetCodeUnit('-'); 3085 } 3086 ungetCodeUnit('!'); 3087 } 3088 } 3089 if (matchCodeUnit('<')) { 3090 simpleKind = 3091 matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh; 3092 } else { 3093 simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt; 3094 } 3095 break; 3096 3097 case '>': 3098 if (matchCodeUnit('>')) { 3099 if (matchCodeUnit('>')) { 3100 simpleKind = 3101 matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh; 3102 } else { 3103 simpleKind = 3104 matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh; 3105 } 3106 } else { 3107 simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt; 3108 } 3109 break; 3110 3111 case '*': 3112 if (matchCodeUnit('*')) { 3113 simpleKind = 3114 matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow; 3115 } else { 3116 simpleKind = 3117 matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul; 3118 } 3119 break; 3120 3121 case '/': 3122 // Look for a single-line comment. 3123 if (matchCodeUnit('/')) { 3124 unit = getCodeUnit(); 3125 if (unit == '@' || unit == '#') { 3126 bool shouldWarn = unit == '@'; 3127 if (!getDirectives(false, shouldWarn)) { 3128 return false; 3129 } 3130 } else { 3131 // NOTE: |unit| may be EOF here. 3132 ungetCodeUnit(unit); 3133 } 3134 3135 this->sourceUnits.consumeRestOfSingleLineComment(); 3136 continue; 3137 } 3138 3139 // Look for a multi-line comment. 3140 if (matchCodeUnit('*')) { 3141 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3142 unsigned linenoBefore = anyChars.lineno; 3143 3144 do { 3145 int32_t unit = getCodeUnit(); 3146 if (unit == EOF) { 3147 error(JSMSG_UNTERMINATED_COMMENT); 3148 return badToken(); 3149 } 3150 3151 if (unit == '*' && matchCodeUnit('/')) { 3152 break; 3153 } 3154 3155 if (unit == '@' || unit == '#') { 3156 bool shouldWarn = unit == '@'; 3157 if (!getDirectives(true, shouldWarn)) { 3158 return badToken(); 3159 } 3160 } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) { 3161 if (!getFullAsciiCodePoint(unit)) { 3162 return badToken(); 3163 } 3164 } else { 3165 char32_t codePoint; 3166 if (!getNonAsciiCodePoint(unit, &codePoint)) { 3167 return badToken(); 3168 } 3169 } 3170 } while (true); 3171 3172 if (linenoBefore != anyChars.lineno) { 3173 anyChars.updateFlagsForEOL(); 3174 } 3175 3176 continue; 3177 } 3178 3179 // Look for a regexp. 3180 if (modifier == SlashIsRegExp) { 3181 return regexpLiteral(start, ttp); 3182 } 3183 3184 simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div; 3185 break; 3186 3187 case '%': 3188 simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod; 3189 break; 3190 3191 case '-': 3192 if (matchCodeUnit('-')) { 3193 if (anyCharsAccess().options().allowHTMLComments && 3194 !anyCharsAccess().flags.isDirtyLine) { 3195 if (matchCodeUnit('>')) { 3196 this->sourceUnits.consumeRestOfSingleLineComment(); 3197 continue; 3198 } 3199 } 3200 3201 simpleKind = TokenKind::Dec; 3202 } else { 3203 simpleKind = 3204 matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub; 3205 } 3206 break; 3207 3208 #ifdef ENABLE_DECORATORS 3209 case '@': 3210 simpleKind = TokenKind::At; 3211 break; 3212 #endif 3213 3214 default: 3215 // We consumed a bad ASCII code point/unit. Put it back so the 3216 // error location is the bad code point. 3217 ungetCodeUnit(unit); 3218 reportIllegalCharacter(unit); 3219 return badToken(); 3220 } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) 3221 3222 MOZ_ASSERT(simpleKind != TokenKind::Limit, 3223 "switch-statement should have set |simpleKind| before " 3224 "breaking"); 3225 3226 newSimpleToken(simpleKind, start, modifier, ttp); 3227 return true; 3228 } while (true); 3229 } 3230 3231 template <typename Unit, class AnyCharsAccess> 3232 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken( 3233 char untilChar, Modifier modifier, TokenKind* out) { 3234 MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`', 3235 "unexpected string/template literal delimiter"); 3236 3237 bool parsingTemplate = (untilChar == '`'); 3238 bool templateHead = false; 3239 3240 TokenStart start(this->sourceUnits, -1); 3241 this->charBuffer.clear(); 3242 3243 // Run the bad-token code for every path out of this function except the 3244 // one success-case. 3245 auto noteBadToken = MakeScopeExit([this]() { this->badToken(); }); 3246 3247 auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) { 3248 // Unicode separators aren't end-of-line in template or (as of 3249 // recently) string literals, so this assertion doesn't allow them. 3250 MOZ_ASSERT(this->sourceUnits.atEnd() || 3251 this->sourceUnits.peekCodeUnit() == Unit('\r') || 3252 this->sourceUnits.peekCodeUnit() == Unit('\n'), 3253 "must be parked at EOF or EOL to call this function"); 3254 3255 // The various errors reported here include language like "in a '' 3256 // literal" or similar, with '' being '', "", or `` as appropriate. 3257 const char delimiters[] = {untilChar, untilChar, '\0'}; 3258 3259 this->error(errnum, delimiters); 3260 return; 3261 }; 3262 3263 // We need to detect any of these chars: " or ', \n (or its 3264 // equivalents), \\, EOF. Because we detect EOL sequences here and 3265 // put them back immediately, we can use getCodeUnit(). 3266 int32_t unit; 3267 while ((unit = getCodeUnit()) != untilChar) { 3268 if (unit == EOF) { 3269 ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL); 3270 return false; 3271 } 3272 3273 // Non-ASCII code points are always directly appended -- even 3274 // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are 3275 // ordinarily LineTerminatorSequences. (They contribute their literal 3276 // values to template and [as of recently] string literals, but they're 3277 // line terminators when computing line/column coordinates.) Handle 3278 // the non-ASCII case early for readability. 3279 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { 3280 char32_t cp; 3281 if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) { 3282 return false; 3283 } 3284 3285 if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR || 3286 cp == unicode::PARA_SEPARATOR)) { 3287 if (!updateLineInfoForEOL()) { 3288 return false; 3289 } 3290 3291 anyCharsAccess().updateFlagsForEOL(); 3292 } else { 3293 MOZ_ASSERT(!IsLineTerminator(cp)); 3294 } 3295 3296 if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) { 3297 return false; 3298 } 3299 3300 continue; 3301 } 3302 3303 if (unit == '\\') { 3304 // When parsing templates, we don't immediately report errors for 3305 // invalid escapes; these are handled by the parser. We don't 3306 // append to charBuffer in those cases because it won't be read. 3307 unit = getCodeUnit(); 3308 if (unit == EOF) { 3309 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); 3310 return false; 3311 } 3312 3313 // Non-ASCII |unit| isn't handled by code after this, so dedicate 3314 // an unlikely special-case to it and then continue. 3315 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) { 3316 char32_t codePoint; 3317 if (!getNonAsciiCodePoint(unit, &codePoint)) { 3318 return false; 3319 } 3320 3321 // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH 3322 // SEPARATOR, they'll be normalized to '\n'. '\' followed by 3323 // LineContinuation represents no code points, so don't append 3324 // in this case. 3325 if (codePoint != '\n') { 3326 if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) { 3327 return false; 3328 } 3329 } 3330 3331 continue; 3332 } 3333 3334 // The block above eliminated all non-ASCII, so cast to the 3335 // smallest type possible to assist the C++ compiler. 3336 switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) { 3337 case 'b': 3338 unit = '\b'; 3339 break; 3340 case 'f': 3341 unit = '\f'; 3342 break; 3343 case 'n': 3344 unit = '\n'; 3345 break; 3346 case 'r': 3347 unit = '\r'; 3348 break; 3349 case 't': 3350 unit = '\t'; 3351 break; 3352 case 'v': 3353 unit = '\v'; 3354 break; 3355 3356 case '\r': 3357 matchLineTerminator('\n'); 3358 [[fallthrough]]; 3359 case '\n': { 3360 // LineContinuation represents no code points. We're manually 3361 // consuming a LineTerminatorSequence, so we must manually 3362 // update line/column info. 3363 if (!updateLineInfoForEOL()) { 3364 return false; 3365 } 3366 3367 continue; 3368 } 3369 3370 // Unicode character specification. 3371 case 'u': { 3372 int32_t c2 = getCodeUnit(); 3373 if (c2 == EOF) { 3374 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); 3375 return false; 3376 } 3377 3378 // First handle a delimited Unicode escape, e.g. \u{1F4A9}. 3379 if (c2 == '{') { 3380 uint32_t start = this->sourceUnits.offset() - 3; 3381 uint32_t code = 0; 3382 bool first = true; 3383 bool valid = true; 3384 do { 3385 int32_t u3 = getCodeUnit(); 3386 if (u3 == EOF) { 3387 if (parsingTemplate) { 3388 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3389 anyChars.setInvalidTemplateEscape(start, 3390 InvalidEscapeType::Unicode); 3391 valid = false; 3392 break; 3393 } 3394 reportInvalidEscapeError(start, InvalidEscapeType::Unicode); 3395 return false; 3396 } 3397 if (u3 == '}') { 3398 if (first) { 3399 if (parsingTemplate) { 3400 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3401 anyChars.setInvalidTemplateEscape( 3402 start, InvalidEscapeType::Unicode); 3403 valid = false; 3404 break; 3405 } 3406 reportInvalidEscapeError(start, InvalidEscapeType::Unicode); 3407 return false; 3408 } 3409 break; 3410 } 3411 3412 // Beware: |u3| may be a non-ASCII code point here; if 3413 // so it'll pass into this |if|-block. 3414 if (!IsAsciiHexDigit(u3)) { 3415 if (parsingTemplate) { 3416 // We put the code unit back so that we read it 3417 // on the next pass, which matters if it was 3418 // '`' or '\'. 3419 ungetCodeUnit(u3); 3420 3421 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3422 anyChars.setInvalidTemplateEscape(start, 3423 InvalidEscapeType::Unicode); 3424 valid = false; 3425 break; 3426 } 3427 reportInvalidEscapeError(start, InvalidEscapeType::Unicode); 3428 return false; 3429 } 3430 3431 code = (code << 4) | AsciiAlphanumericToNumber(u3); 3432 if (code > unicode::NonBMPMax) { 3433 if (parsingTemplate) { 3434 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3435 anyChars.setInvalidTemplateEscape( 3436 start + 3, InvalidEscapeType::UnicodeOverflow); 3437 valid = false; 3438 break; 3439 } 3440 reportInvalidEscapeError(start + 3, 3441 InvalidEscapeType::UnicodeOverflow); 3442 return false; 3443 } 3444 3445 first = false; 3446 } while (true); 3447 3448 if (!valid) { 3449 continue; 3450 } 3451 3452 MOZ_ASSERT(code <= unicode::NonBMPMax); 3453 if (!AppendCodePointToCharBuffer(this->charBuffer, code)) { 3454 return false; 3455 } 3456 3457 continue; 3458 } // end of delimited Unicode escape handling 3459 3460 // Otherwise it must be a fixed-length \uXXXX Unicode escape. 3461 // If it isn't, this is usually an error -- but if this is a 3462 // template literal, we must defer error reporting because 3463 // malformed escapes are okay in *tagged* template literals. 3464 char16_t v; 3465 if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) { 3466 unit = (AsciiAlphanumericToNumber(c2) << 12) | v; 3467 } else { 3468 // Beware: |c2| may not be an ASCII code point here! 3469 ungetCodeUnit(c2); 3470 uint32_t start = this->sourceUnits.offset() - 2; 3471 if (parsingTemplate) { 3472 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3473 anyChars.setInvalidTemplateEscape(start, 3474 InvalidEscapeType::Unicode); 3475 continue; 3476 } 3477 reportInvalidEscapeError(start, InvalidEscapeType::Unicode); 3478 return false; 3479 } 3480 break; 3481 } // case 'u' 3482 3483 // Hexadecimal character specification. 3484 case 'x': { 3485 char16_t v; 3486 if (this->sourceUnits.matchHexDigits(2, &v)) { 3487 unit = v; 3488 } else { 3489 uint32_t start = this->sourceUnits.offset() - 2; 3490 if (parsingTemplate) { 3491 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3492 anyChars.setInvalidTemplateEscape(start, 3493 InvalidEscapeType::Hexadecimal); 3494 continue; 3495 } 3496 reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal); 3497 return false; 3498 } 3499 break; 3500 } 3501 3502 default: { 3503 if (!IsAsciiOctal(unit)) { 3504 // \8 or \9 in an untagged template literal is a syntax error, 3505 // reported in GeneralParser::noSubstitutionUntaggedTemplate. 3506 // 3507 // Tagged template literals, however, may contain \8 and \9. The 3508 // "cooked" representation of such a part will be |undefined|, and 3509 // the "raw" representation will contain the literal characters. 3510 // 3511 // function f(parts) { 3512 // assertEq(parts[0], undefined); 3513 // assertEq(parts.raw[0], "\\8"); 3514 // return "composed"; 3515 // } 3516 // assertEq(f`\8`, "composed"); 3517 if (unit == '8' || unit == '9') { 3518 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3519 if (parsingTemplate) { 3520 anyChars.setInvalidTemplateEscape( 3521 this->sourceUnits.offset() - 2, 3522 InvalidEscapeType::EightOrNine); 3523 continue; 3524 } 3525 3526 // \8 and \9 are forbidden in string literals in strict mode code. 3527 if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) { 3528 return false; 3529 } 3530 3531 // The above test doesn't catch a few edge cases; see 3532 // |GeneralParser::maybeParseDirective|. Record the violation so 3533 // that that function can handle them. 3534 anyChars.setSawDeprecatedEightOrNineEscape(); 3535 } 3536 break; 3537 } 3538 3539 // Octal character specification. 3540 int32_t val = AsciiOctalToNumber(unit); 3541 3542 unit = peekCodeUnit(); 3543 if (MOZ_UNLIKELY(unit == EOF)) { 3544 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); 3545 return false; 3546 } 3547 3548 // Strict mode code allows only \0 followed by a non-digit. 3549 if (val != 0 || IsAsciiDigit(unit)) { 3550 TokenStreamAnyChars& anyChars = anyCharsAccess(); 3551 if (parsingTemplate) { 3552 anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2, 3553 InvalidEscapeType::Octal); 3554 continue; 3555 } 3556 3557 if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) { 3558 return false; 3559 } 3560 3561 // The above test doesn't catch a few edge cases; see 3562 // |GeneralParser::maybeParseDirective|. Record the violation so 3563 // that that function can handle them. 3564 anyChars.setSawDeprecatedOctalEscape(); 3565 } 3566 3567 if (IsAsciiOctal(unit)) { 3568 val = 8 * val + AsciiOctalToNumber(unit); 3569 consumeKnownCodeUnit(unit); 3570 3571 unit = peekCodeUnit(); 3572 if (MOZ_UNLIKELY(unit == EOF)) { 3573 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL); 3574 return false; 3575 } 3576 3577 if (IsAsciiOctal(unit)) { 3578 int32_t save = val; 3579 val = 8 * val + AsciiOctalToNumber(unit); 3580 if (val <= 0xFF) { 3581 consumeKnownCodeUnit(unit); 3582 } else { 3583 val = save; 3584 } 3585 } 3586 } 3587 3588 unit = char16_t(val); 3589 break; 3590 } // default 3591 } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) 3592 3593 if (!this->charBuffer.append(unit)) { 3594 return false; 3595 } 3596 3597 continue; 3598 } // (unit == '\\') 3599 3600 if (unit == '\r' || unit == '\n') { 3601 if (!parsingTemplate) { 3602 // String literals don't allow ASCII line breaks. 3603 ungetCodeUnit(unit); 3604 ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING); 3605 return false; 3606 } 3607 3608 if (unit == '\r') { 3609 unit = '\n'; 3610 matchLineTerminator('\n'); 3611 } 3612 3613 if (!updateLineInfoForEOL()) { 3614 return false; 3615 } 3616 3617 anyCharsAccess().updateFlagsForEOL(); 3618 } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) { 3619 templateHead = true; 3620 break; 3621 } 3622 3623 if (!this->charBuffer.append(unit)) { 3624 return false; 3625 } 3626 } 3627 3628 TaggedParserAtomIndex atom = drainCharBufferIntoAtom(); 3629 if (!atom) { 3630 return false; 3631 } 3632 3633 noteBadToken.release(); 3634 3635 MOZ_ASSERT_IF(!parsingTemplate, !templateHead); 3636 3637 TokenKind kind = !parsingTemplate ? TokenKind::String 3638 : templateHead ? TokenKind::TemplateHead 3639 : TokenKind::NoSubsTemplate; 3640 newAtomToken(kind, atom, start, modifier, out); 3641 return true; 3642 } 3643 3644 const char* TokenKindToDesc(TokenKind tt) { 3645 switch (tt) { 3646 #define EMIT_CASE(name, desc) \ 3647 case TokenKind::name: \ 3648 return desc; 3649 FOR_EACH_TOKEN_KIND(EMIT_CASE) 3650 #undef EMIT_CASE 3651 case TokenKind::Limit: 3652 MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed."); 3653 break; 3654 } 3655 3656 return "<bad TokenKind>"; 3657 } 3658 3659 #ifdef DEBUG 3660 const char* TokenKindToString(TokenKind tt) { 3661 switch (tt) { 3662 # define EMIT_CASE(name, desc) \ 3663 case TokenKind::name: \ 3664 return "TokenKind::" #name; 3665 FOR_EACH_TOKEN_KIND(EMIT_CASE) 3666 # undef EMIT_CASE 3667 case TokenKind::Limit: 3668 break; 3669 } 3670 3671 return "<bad TokenKind>"; 3672 } 3673 #endif 3674 3675 template class TokenStreamCharsBase<Utf8Unit>; 3676 template class TokenStreamCharsBase<char16_t>; 3677 3678 template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>; 3679 template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>; 3680 template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>; 3681 3682 template class GeneralTokenStreamChars< 3683 Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>; 3684 template class GeneralTokenStreamChars< 3685 Utf8Unit, 3686 ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>; 3687 template class GeneralTokenStreamChars< 3688 char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>; 3689 template class GeneralTokenStreamChars< 3690 char16_t, 3691 ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>; 3692 3693 template class TokenStreamChars< 3694 Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>; 3695 template class TokenStreamChars< 3696 Utf8Unit, 3697 ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>; 3698 template class TokenStreamChars< 3699 char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>; 3700 template class TokenStreamChars< 3701 char16_t, 3702 ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>; 3703 3704 template class TokenStreamSpecific< 3705 Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>; 3706 template class TokenStreamSpecific< 3707 Utf8Unit, 3708 ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>; 3709 template class TokenStreamSpecific< 3710 char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>; 3711 template class TokenStreamSpecific< 3712 char16_t, 3713 ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>; 3714 3715 } // namespace frontend 3716 3717 } // namespace js