nsHtml5StreamParser.cpp (106697B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set sw=2 ts=2 et tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "nsHtml5StreamParser.h" 8 9 #include <stdlib.h> 10 #include <string.h> 11 #include <utility> 12 #include "ErrorList.h" 13 #include "GeckoProfiler.h" 14 #include "js/GCAPI.h" 15 #include "mozilla/Buffer.h" 16 #include "mozilla/CheckedInt.h" 17 #include "mozilla/Encoding.h" 18 #include "mozilla/EncodingDetector.h" 19 #include "mozilla/Likely.h" 20 #include "mozilla/Maybe.h" 21 #include "mozilla/SchedulerGroup.h" 22 #include "mozilla/ScopeExit.h" 23 #include "mozilla/Services.h" 24 #include "mozilla/StaticPrefs_html5.h" 25 #include "mozilla/StaticPrefs_network.h" 26 #include "mozilla/TextUtils.h" 27 28 #include "mozilla/dom/BindingDeclarations.h" 29 #include "mozilla/dom/BrowsingContext.h" 30 #include "mozilla/dom/DebuggerUtilsBinding.h" 31 #include "mozilla/dom/Document.h" 32 #include "mozilla/Vector.h" 33 #include "nsContentSink.h" 34 #include "nsContentUtils.h" 35 #include "nsCycleCollectionTraversalCallback.h" 36 #include "nsHtml5AtomTable.h" 37 #include "nsHtml5Highlighter.h" 38 #include "nsHtml5Module.h" 39 #include "nsHtml5OwningUTF16Buffer.h" 40 #include "nsHtml5Parser.h" 41 #include "nsHtml5Speculation.h" 42 #include "nsHtml5StreamParserPtr.h" 43 #include "nsHtml5Tokenizer.h" 44 #include "nsHtml5TreeBuilder.h" 45 #include "nsHtml5TreeOpExecutor.h" 46 #include "nsIChannel.h" 47 #include "nsIContentSink.h" 48 #include "nsID.h" 49 #include "nsIDocShell.h" 50 #include "nsIHttpChannel.h" 51 #include "nsIInputStream.h" 52 #include "nsINestedURI.h" 53 #include "nsIObserverService.h" 54 #include "nsIRequest.h" 55 #include "nsIRunnable.h" 56 #include "nsIScriptError.h" 57 #include "nsIThread.h" 58 #include "nsIThreadRetargetableRequest.h" 59 #include "nsITimer.h" 60 #include "nsIURI.h" 61 #include "nsJSEnvironment.h" 62 #include "nsLiteralString.h" 63 #include "nsNetUtil.h" 64 #include "nsString.h" 65 #include "nsTPromiseFlatString.h" 66 #include "nsThreadUtils.h" 67 #include "nsXULAppAPI.h" 68 69 extern "C" { 70 // Defined in intl/encoding_glue/src/lib.rs 71 const mozilla::Encoding* xmldecl_parse(const uint8_t* buf, size_t buf_len); 72 }; 73 74 using namespace mozilla; 75 using namespace mozilla::dom; 76 77 /* 78 * Note that nsHtml5StreamParser implements cycle collecting AddRef and 79 * Release. Therefore, nsHtml5StreamParser must never be refcounted from 80 * the parser thread! 81 * 82 * To work around this limitation, runnables posted by the main thread to the 83 * parser thread hold their reference to the stream parser in an 84 * nsHtml5StreamParserPtr. Upon creation, nsHtml5StreamParserPtr addrefs the 85 * object it holds 86 * just like a regular nsRefPtr. This is OK, since the creation of the 87 * runnable and the nsHtml5StreamParserPtr happens on the main thread. 88 * 89 * When the runnable is done on the parser thread, the destructor of 90 * nsHtml5StreamParserPtr runs there. It doesn't call Release on the held object 91 * directly. Instead, it posts another runnable back to the main thread where 92 * that runnable calls Release on the wrapped object. 93 * 94 * When posting runnables in the other direction, the runnables have to be 95 * created on the main thread when nsHtml5StreamParser is instantiated and 96 * held for the lifetime of the nsHtml5StreamParser. This works, because the 97 * same runnabled can be dispatched multiple times and currently runnables 98 * posted from the parser thread to main thread don't need to wrap any 99 * runnable-specific data. (In the other direction, the runnables most notably 100 * wrap the byte data of the stream.) 101 */ 102 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser) 103 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser) 104 105 NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser) 106 NS_INTERFACE_TABLE(nsHtml5StreamParser, nsISupports) 107 NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser) 108 NS_INTERFACE_MAP_END 109 110 NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser) 111 112 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser) 113 tmp->DropTimer(); 114 NS_IMPL_CYCLE_COLLECTION_UNLINK(mRequest) 115 NS_IMPL_CYCLE_COLLECTION_UNLINK(mOwner) 116 tmp->mExecutorFlusher = nullptr; 117 tmp->mLoadFlusher = nullptr; 118 tmp->mExecutor = nullptr; 119 NS_IMPL_CYCLE_COLLECTION_UNLINK_END 120 121 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser) 122 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mRequest) 123 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mOwner) 124 // hack: count the strongly owned edge wrapped in the runnable 125 if (tmp->mExecutorFlusher) { 126 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor"); 127 cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor)); 128 } 129 // hack: count the strongly owned edge wrapped in the runnable 130 if (tmp->mLoadFlusher) { 131 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor"); 132 cb.NoteXPCOMChild(static_cast<nsIContentSink*>(tmp->mExecutor)); 133 } 134 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END 135 136 class nsHtml5ExecutorFlusher : public Runnable { 137 private: 138 RefPtr<nsHtml5TreeOpExecutor> mExecutor; 139 140 public: 141 explicit nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor) 142 : Runnable("nsHtml5ExecutorFlusher"), mExecutor(aExecutor) {} 143 NS_IMETHOD Run() override { 144 if (!mExecutor->isInList()) { 145 Document* doc = mExecutor->GetDocument(); 146 if (XRE_IsContentProcess() && 147 nsContentUtils:: 148 HighPriorityEventPendingForTopLevelDocumentBeforeContentfulPaint( 149 doc)) { 150 // Possible early paint pending, reuse the runnable and try to 151 // call RunFlushLoop later. 152 nsCOMPtr<nsIRunnable> flusher = this; 153 if (NS_SUCCEEDED(doc->Dispatch(flusher.forget()))) { 154 PROFILER_MARKER_UNTYPED("HighPrio blocking parser flushing(1)", DOM); 155 return NS_OK; 156 } 157 } 158 mExecutor->RunFlushLoop(); 159 } 160 return NS_OK; 161 } 162 }; 163 164 class nsHtml5LoadFlusher : public Runnable { 165 private: 166 RefPtr<nsHtml5TreeOpExecutor> mExecutor; 167 168 public: 169 explicit nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor) 170 : Runnable("nsHtml5LoadFlusher"), mExecutor(aExecutor) {} 171 NS_IMETHOD Run() override { 172 // If we're in sync XHR, do nothing. We'll flush the speculative loads 173 // after the flush ends. 174 if (!mExecutor->IsFlushing()) { 175 mExecutor->FlushSpeculativeLoads(); 176 } 177 return NS_OK; 178 } 179 }; 180 181 nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, 182 nsHtml5Parser* aOwner, 183 eParserMode aMode) 184 : mBomState(eBomState::BOM_SNIFFING_NOT_STARTED), 185 mCharsetSource(kCharsetUninitialized), 186 mEncodingSwitchSource(kCharsetUninitialized), 187 mEncoding(X_USER_DEFINED_ENCODING), // Obviously bogus value to notice if 188 // not updated 189 mNeedsEncodingSwitchTo(nullptr), 190 mSeenEligibleMetaCharset(false), 191 mChardetEof(false), 192 #ifdef DEBUG 193 mStartedFeedingDetector(false), 194 mStartedFeedingDevTools(false), 195 #endif 196 mReparseForbidden(false), 197 mForceAutoDetection(false), 198 mChannelHadCharset(false), 199 mLookingForMetaCharset(false), 200 mStartsWithLtQuestion(false), 201 mLookingForXmlDeclarationForXmlViewSource(false), 202 mTemplatePushedOrHeadPopped(false), 203 mGtBuffer(nullptr), 204 mGtPos(0), 205 mLastBuffer(nullptr), // Will be filled when starting 206 mExecutor(aExecutor), 207 mTreeBuilder(new nsHtml5TreeBuilder( 208 (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) 209 ? nullptr 210 : mExecutor->GetStage(), 211 mExecutor->GetStage(), aMode == NORMAL)), 212 mTokenizer( 213 new nsHtml5Tokenizer(mTreeBuilder.get(), aMode == VIEW_SOURCE_XML)), 214 mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex"), 215 mOwner(aOwner), 216 mLastWasCR(false), 217 mStreamState(eHtml5StreamState::STREAM_NOT_STARTED), 218 mSpeculating(false), 219 mAtEOF(false), 220 mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex"), 221 mSpeculationFailureCount(0), 222 mNumBytesBuffered(0), 223 mTerminated(false), 224 mInterrupted(false), 225 mEventTarget(nsHtml5Module::GetStreamParserEventTarget()), 226 mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor)), 227 mLoadFlusher(new nsHtml5LoadFlusher(aExecutor)), 228 mInitialEncodingWasFromParentFrame(false), 229 mHasHadErrors(false), 230 mDetectorHasSeenNonAscii(false), 231 mDecodingLocalFileWithoutTokenizing(false), 232 mBufferingBytes(false), 233 mFlushTimer(NS_NewTimer(mEventTarget)), 234 mFlushTimerMutex("nsHtml5StreamParser mFlushTimerMutex"), 235 mFlushTimerArmed(false), 236 mFlushTimerEverFired(false), 237 mMode(aMode), 238 mBrowserIdForDevtools(0), 239 mBrowsingContextIDForDevtools(0) { 240 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 241 #ifdef DEBUG 242 mAtomTable.SetPermittedLookupEventTarget(mEventTarget); 243 #endif 244 mTokenizer->setInterner(&mAtomTable); 245 mTokenizer->setEncodingDeclarationHandler(this); 246 247 if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) { 248 nsHtml5Highlighter* highlighter = 249 new nsHtml5Highlighter(mExecutor->GetStage()); 250 mTokenizer->EnableViewSource(highlighter); // takes ownership 251 mTreeBuilder->EnableViewSource(highlighter); // doesn't own 252 } 253 254 // There's a zeroing operator new for everything else 255 } 256 257 nsHtml5StreamParser::~nsHtml5StreamParser() { 258 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 259 mTokenizer->end(); 260 #ifdef DEBUG 261 { 262 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex); 263 MOZ_ASSERT(!mFlushTimer, "Flush timer was not dropped before dtor!"); 264 } 265 mRequest = nullptr; 266 mUnicodeDecoder = nullptr; 267 mFirstBuffer = nullptr; 268 mExecutor = nullptr; 269 mTreeBuilder = nullptr; 270 mTokenizer = nullptr; 271 mOwner = nullptr; 272 #endif 273 } 274 275 nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) { 276 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 277 return mRequest ? CallQueryInterface(mRequest, aChannel) 278 : NS_ERROR_NOT_AVAILABLE; 279 } 280 281 std::tuple<NotNull<const Encoding*>, nsCharsetSource> 282 nsHtml5StreamParser::GuessEncoding(bool aInitial) { 283 MOZ_ASSERT( 284 mCharsetSource != kCharsetFromFinalUserForcedAutoDetection && 285 mCharsetSource != 286 kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII && 287 mCharsetSource != 288 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic && 289 mCharsetSource != 290 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII && 291 mCharsetSource != 292 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content && 293 mCharsetSource != 294 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII && 295 mCharsetSource != 296 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD && 297 mCharsetSource != 298 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII && 299 mCharsetSource != kCharsetFromFinalAutoDetectionFile); 300 auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true); 301 auto encoding = 302 mForceAutoDetection 303 ? ifHadBeenForced 304 : mDetector->Guess(mTLD, mDecodingLocalFileWithoutTokenizing); 305 nsCharsetSource source = 306 aInitial 307 ? (mForceAutoDetection 308 ? kCharsetFromInitialUserForcedAutoDetection 309 : (mDecodingLocalFileWithoutTokenizing 310 ? kCharsetFromFinalAutoDetectionFile 311 : kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic)) 312 : (mForceAutoDetection 313 ? kCharsetFromFinalUserForcedAutoDetection 314 : (mDecodingLocalFileWithoutTokenizing 315 ? kCharsetFromFinalAutoDetectionFile 316 : kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic)); 317 if (source == kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic) { 318 if (encoding == ISO_2022_JP_ENCODING) { 319 if (EncodingDetector::TldMayAffectGuess(mTLD)) { 320 source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content; 321 } 322 } else if (!mDetectorHasSeenNonAscii) { 323 source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial 324 } else if (ifHadBeenForced == UTF_8_ENCODING) { 325 MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII || 326 mCharsetSource == 327 kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 || 328 mEncoding == ISO_2022_JP_ENCODING); 329 source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII; 330 } else if (encoding != ifHadBeenForced) { 331 if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) { 332 source = 333 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII; 334 } else { 335 source = 336 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD; 337 } 338 } else if (EncodingDetector::TldMayAffectGuess(mTLD)) { 339 if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) { 340 source = 341 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII; 342 } else { 343 source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content; 344 } 345 } else if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) { 346 source = 347 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII; 348 } 349 } else if (source == 350 kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) { 351 if (encoding == ISO_2022_JP_ENCODING) { 352 if (EncodingDetector::TldMayAffectGuess(mTLD)) { 353 source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content; 354 } 355 } else if (!mDetectorHasSeenNonAscii) { 356 source = kCharsetFromInitialAutoDetectionASCII; 357 } else if (ifHadBeenForced == UTF_8_ENCODING) { 358 source = kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8; 359 } else if (encoding != ifHadBeenForced) { 360 source = 361 kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD; 362 } else if (EncodingDetector::TldMayAffectGuess(mTLD)) { 363 source = kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content; 364 } 365 } 366 return {encoding, source}; 367 } 368 369 void nsHtml5StreamParser::FeedDetector(Span<const uint8_t> aBuffer) { 370 #ifdef DEBUG 371 mStartedFeedingDetector = true; 372 #endif 373 MOZ_ASSERT(!mChardetEof); 374 mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, false); 375 } 376 377 void nsHtml5StreamParser::DetectorEof() { 378 #ifdef DEBUG 379 mStartedFeedingDetector = true; 380 #endif 381 if (mChardetEof) { 382 return; 383 } 384 mChardetEof = true; 385 mDetectorHasSeenNonAscii = mDetector->Feed(Span<const uint8_t>(), true); 386 } 387 388 void nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL) { 389 MOZ_ASSERT(NS_IsMainThread()); 390 391 BrowsingContext* browsingContext = 392 mExecutor->GetDocument()->GetBrowsingContext(); 393 if (browsingContext && browsingContext->WatchedByDevTools()) { 394 mURIToSendToDevtools = aURL; 395 396 nsID uuid; 397 nsresult rv = nsID::GenerateUUIDInPlace(uuid); 398 if (!NS_FAILED(rv)) { 399 char buffer[NSID_LENGTH]; 400 uuid.ToProvidedString(buffer); 401 mUUIDForDevtools = NS_ConvertASCIItoUTF16(buffer); 402 } 403 mBrowserIdForDevtools = browsingContext->BrowserId(); 404 mBrowsingContextIDForDevtools = browsingContext->Id(); 405 } 406 407 if (aURL) { 408 nsCOMPtr<nsIURI> temp; 409 if (aURL->SchemeIs("view-source")) { 410 nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL); 411 nested->GetInnerURI(getter_AddRefs(temp)); 412 } else { 413 temp = aURL; 414 } 415 if (temp->SchemeIs("data")) { 416 // Avoid showing potentially huge data: URLs. The three last bytes are 417 // UTF-8 for an ellipsis. 418 mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6"); 419 } else { 420 nsresult rv = temp->GetSpec(mViewSourceTitle); 421 if (NS_FAILED(rv)) { 422 mViewSourceTitle.AssignLiteral("\xE2\x80\xA6"); 423 } 424 } 425 } 426 } 427 428 nsresult 429 nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment( 430 Span<const uint8_t> aPrefix, Span<const uint8_t> aFromSegment) { 431 NS_ASSERTION(IsParserThread(), "Wrong thread!"); 432 mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval(); 433 nsresult rv = WriteStreamBytes(aPrefix); 434 NS_ENSURE_SUCCESS(rv, rv); 435 return WriteStreamBytes(aFromSegment); 436 } 437 438 void nsHtml5StreamParser::SetupDecodingFromBom( 439 NotNull<const Encoding*> aEncoding) { 440 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 441 mEncoding = aEncoding; 442 mDecodingLocalFileWithoutTokenizing = false; 443 mLookingForMetaCharset = false; 444 mBufferingBytes = false; 445 mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling(); 446 mCharsetSource = kCharsetFromByteOrderMark; 447 mForceAutoDetection = false; 448 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false); 449 mBomState = BOM_SNIFFING_OVER; 450 if (mMode == VIEW_SOURCE_HTML) { 451 mTokenizer->StartViewSourceBodyContents(); 452 } 453 } 454 455 void nsHtml5StreamParser::SetupDecodingFromUtf16BogoXml( 456 NotNull<const Encoding*> aEncoding) { 457 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 458 mEncoding = aEncoding; 459 mDecodingLocalFileWithoutTokenizing = false; 460 mLookingForMetaCharset = false; 461 mBufferingBytes = false; 462 mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling(); 463 mCharsetSource = kCharsetFromXmlDeclarationUtf16; 464 mForceAutoDetection = false; 465 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false); 466 mBomState = BOM_SNIFFING_OVER; 467 if (mMode == VIEW_SOURCE_HTML) { 468 mTokenizer->StartViewSourceBodyContents(); 469 } 470 auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE); 471 dst[0] = '<'; 472 dst[1] = '?'; 473 dst[2] = 'x'; 474 mLastBuffer->AdvanceEnd(3); 475 MOZ_ASSERT(!mStartedFeedingDevTools); 476 OnNewContent(dst.To(3)); 477 } 478 479 size_t nsHtml5StreamParser::LengthOfLtContainingPrefixInSecondBuffer() { 480 MOZ_ASSERT(mBufferedBytes.Length() <= 2); 481 if (mBufferedBytes.Length() < 2) { 482 return 0; 483 } 484 Buffer<uint8_t>& second = mBufferedBytes[1]; 485 const uint8_t* elements = second.Elements(); 486 const uint8_t* lt = (const uint8_t*)memchr(elements, '>', second.Length()); 487 if (lt) { 488 return (lt - elements) + 1; 489 } 490 return 0; 491 } 492 493 nsresult nsHtml5StreamParser::SniffStreamBytes(Span<const uint8_t> aFromSegment, 494 bool aEof) { 495 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 496 MOZ_ASSERT_IF(aEof, aFromSegment.IsEmpty()); 497 498 if (mCharsetSource >= 499 kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII && 500 mCharsetSource <= kCharsetFromFinalUserForcedAutoDetection) { 501 if (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN) { 502 mTreeBuilder->MaybeComplainAboutCharset("EncDetectorReloadPlain", true, 503 0); 504 } else { 505 mTreeBuilder->MaybeComplainAboutCharset("EncDetectorReload", true, 0); 506 } 507 } 508 509 // mEncoding and mCharsetSource potentially have come from channel or higher 510 // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them. 511 // If we don't find a BOM, the previously set values of mEncoding and 512 // mCharsetSource are not modified by the BOM sniffing here. 513 static uint8_t utf8[] = {0xEF, 0xBB}; 514 static uint8_t utf16le[] = {0xFF}; 515 static uint8_t utf16be[] = {0xFE}; 516 static uint8_t utf16leXml[] = {'<', 0x00, '?', 0x00, 'x'}; 517 static uint8_t utf16beXml[] = {0x00, '<', 0x00, '?', 0x00}; 518 // Buffer for replaying past bytes based on state machine state. If 519 // writing this from scratch, probably wouldn't do it this way, but 520 // let's keep the changes to a minimum. 521 const uint8_t* prefix = utf8; 522 size_t prefixLength = 0; 523 if (aEof && mBomState == BOM_SNIFFING_NOT_STARTED) { 524 // Avoid handling aEof in the BOM_SNIFFING_NOT_STARTED state below. 525 mBomState = BOM_SNIFFING_OVER; 526 } 527 for (size_t i = 0; 528 (i < aFromSegment.Length() && mBomState != BOM_SNIFFING_OVER) || aEof; 529 i++) { 530 switch (mBomState) { 531 case BOM_SNIFFING_NOT_STARTED: 532 MOZ_ASSERT(i == 0, "Bad BOM sniffing state."); 533 MOZ_ASSERT(!aEof, "Should have checked for aEof above!"); 534 switch (aFromSegment[0]) { 535 case 0xEF: 536 mBomState = SEEN_UTF_8_FIRST_BYTE; 537 break; 538 case 0xFF: 539 mBomState = SEEN_UTF_16_LE_FIRST_BYTE; 540 break; 541 case 0xFE: 542 mBomState = SEEN_UTF_16_BE_FIRST_BYTE; 543 break; 544 case 0x00: 545 if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 && 546 mCharsetSource != kCharsetFromChannel) { 547 mBomState = SEEN_UTF_16_BE_XML_FIRST; 548 } else { 549 mBomState = BOM_SNIFFING_OVER; 550 } 551 break; 552 case '<': 553 if (mCharsetSource < kCharsetFromXmlDeclarationUtf16 && 554 mCharsetSource != kCharsetFromChannel) { 555 mBomState = SEEN_UTF_16_LE_XML_FIRST; 556 } else { 557 mBomState = BOM_SNIFFING_OVER; 558 } 559 break; 560 default: 561 mBomState = BOM_SNIFFING_OVER; 562 break; 563 } 564 break; 565 case SEEN_UTF_16_LE_FIRST_BYTE: 566 if (!aEof && aFromSegment[i] == 0xFE) { 567 SetupDecodingFromBom(UTF_16LE_ENCODING); 568 return WriteStreamBytes(aFromSegment.From(i + 1)); 569 } 570 prefix = utf16le; 571 prefixLength = 1 - i; 572 mBomState = BOM_SNIFFING_OVER; 573 break; 574 case SEEN_UTF_16_BE_FIRST_BYTE: 575 if (!aEof && aFromSegment[i] == 0xFF) { 576 SetupDecodingFromBom(UTF_16BE_ENCODING); 577 return WriteStreamBytes(aFromSegment.From(i + 1)); 578 } 579 prefix = utf16be; 580 prefixLength = 1 - i; 581 mBomState = BOM_SNIFFING_OVER; 582 break; 583 case SEEN_UTF_8_FIRST_BYTE: 584 if (!aEof && aFromSegment[i] == 0xBB) { 585 mBomState = SEEN_UTF_8_SECOND_BYTE; 586 } else { 587 prefixLength = 1 - i; 588 mBomState = BOM_SNIFFING_OVER; 589 } 590 break; 591 case SEEN_UTF_8_SECOND_BYTE: 592 if (!aEof && aFromSegment[i] == 0xBF) { 593 SetupDecodingFromBom(UTF_8_ENCODING); 594 return WriteStreamBytes(aFromSegment.From(i + 1)); 595 } 596 prefixLength = 2 - i; 597 mBomState = BOM_SNIFFING_OVER; 598 break; 599 case SEEN_UTF_16_BE_XML_FIRST: 600 if (!aEof && aFromSegment[i] == '<') { 601 mBomState = SEEN_UTF_16_BE_XML_SECOND; 602 } else { 603 prefix = utf16beXml; 604 prefixLength = 1 - i; 605 mBomState = BOM_SNIFFING_OVER; 606 } 607 break; 608 case SEEN_UTF_16_BE_XML_SECOND: 609 if (!aEof && aFromSegment[i] == 0x00) { 610 mBomState = SEEN_UTF_16_BE_XML_THIRD; 611 } else { 612 prefix = utf16beXml; 613 prefixLength = 2 - i; 614 mBomState = BOM_SNIFFING_OVER; 615 } 616 break; 617 case SEEN_UTF_16_BE_XML_THIRD: 618 if (!aEof && aFromSegment[i] == '?') { 619 mBomState = SEEN_UTF_16_BE_XML_FOURTH; 620 } else { 621 prefix = utf16beXml; 622 prefixLength = 3 - i; 623 mBomState = BOM_SNIFFING_OVER; 624 } 625 break; 626 case SEEN_UTF_16_BE_XML_FOURTH: 627 if (!aEof && aFromSegment[i] == 0x00) { 628 mBomState = SEEN_UTF_16_BE_XML_FIFTH; 629 } else { 630 prefix = utf16beXml; 631 prefixLength = 4 - i; 632 mBomState = BOM_SNIFFING_OVER; 633 } 634 break; 635 case SEEN_UTF_16_BE_XML_FIFTH: 636 if (!aEof && aFromSegment[i] == 'x') { 637 SetupDecodingFromUtf16BogoXml(UTF_16BE_ENCODING); 638 return WriteStreamBytes(aFromSegment.From(i + 1)); 639 } 640 prefix = utf16beXml; 641 prefixLength = 5 - i; 642 mBomState = BOM_SNIFFING_OVER; 643 break; 644 case SEEN_UTF_16_LE_XML_FIRST: 645 if (!aEof && aFromSegment[i] == 0x00) { 646 mBomState = SEEN_UTF_16_LE_XML_SECOND; 647 } else { 648 if (!aEof && aFromSegment[i] == '?' && 649 !(mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN)) { 650 mStartsWithLtQuestion = true; 651 } 652 prefix = utf16leXml; 653 prefixLength = 1 - i; 654 mBomState = BOM_SNIFFING_OVER; 655 } 656 break; 657 case SEEN_UTF_16_LE_XML_SECOND: 658 if (!aEof && aFromSegment[i] == '?') { 659 mBomState = SEEN_UTF_16_LE_XML_THIRD; 660 } else { 661 prefix = utf16leXml; 662 prefixLength = 2 - i; 663 mBomState = BOM_SNIFFING_OVER; 664 } 665 break; 666 case SEEN_UTF_16_LE_XML_THIRD: 667 if (!aEof && aFromSegment[i] == 0x00) { 668 mBomState = SEEN_UTF_16_LE_XML_FOURTH; 669 } else { 670 prefix = utf16leXml; 671 prefixLength = 3 - i; 672 mBomState = BOM_SNIFFING_OVER; 673 } 674 break; 675 case SEEN_UTF_16_LE_XML_FOURTH: 676 if (!aEof && aFromSegment[i] == 'x') { 677 mBomState = SEEN_UTF_16_LE_XML_FIFTH; 678 } else { 679 prefix = utf16leXml; 680 prefixLength = 4 - i; 681 mBomState = BOM_SNIFFING_OVER; 682 } 683 break; 684 case SEEN_UTF_16_LE_XML_FIFTH: 685 if (!aEof && aFromSegment[i] == 0x00) { 686 SetupDecodingFromUtf16BogoXml(UTF_16LE_ENCODING); 687 return WriteStreamBytes(aFromSegment.From(i + 1)); 688 } 689 prefix = utf16leXml; 690 prefixLength = 5 - i; 691 mBomState = BOM_SNIFFING_OVER; 692 break; 693 default: 694 mBomState = BOM_SNIFFING_OVER; 695 break; 696 } 697 if (aEof) { 698 break; 699 } 700 } 701 // if we get here, there either was no BOM or the BOM sniffing isn't complete 702 // yet 703 704 MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark, 705 "Should not come here if BOM was found."); 706 MOZ_ASSERT(mCharsetSource != kCharsetFromXmlDeclarationUtf16, 707 "Should not come here if UTF-16 bogo-XML declaration was found."); 708 MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent, 709 "kCharsetFromOtherComponent is for XSLT."); 710 711 if (mBomState == BOM_SNIFFING_OVER) { 712 if (mMode == VIEW_SOURCE_XML && mStartsWithLtQuestion && 713 mCharsetSource < kCharsetFromChannel) { 714 // Sniff for XML declaration only. 715 MOZ_ASSERT(!mLookingForXmlDeclarationForXmlViewSource); 716 MOZ_ASSERT(!aEof); 717 MOZ_ASSERT(!mLookingForMetaCharset); 718 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing); 719 // Maybe we've already buffered a '>'. 720 MOZ_ASSERT(!mBufferedBytes.IsEmpty(), 721 "How did at least <? not get buffered?"); 722 Buffer<uint8_t>& first = mBufferedBytes[0]; 723 const Encoding* encoding = 724 xmldecl_parse(first.Elements(), first.Length()); 725 if (encoding) { 726 mEncoding = WrapNotNull(encoding); 727 mCharsetSource = kCharsetFromXmlDeclaration; 728 } else if (memchr(first.Elements(), '>', first.Length())) { 729 // There was a '>', but an encoding still wasn't found. 730 ; // fall through to commit to the UTF-8 default. 731 } else if (size_t lengthOfPrefix = 732 LengthOfLtContainingPrefixInSecondBuffer()) { 733 // This can only happen if the first buffer was a lone '<', because 734 // we come here upon seeing the second byte '?' if the first two bytes 735 // were "<?". That is, the only way how we aren't dealing with the first 736 // buffer is if the first buffer only contained a single '<' and we are 737 // dealing with the second buffer that starts with '?'. 738 MOZ_ASSERT(first.Length() == 1); 739 MOZ_ASSERT(mBufferedBytes[1][0] == '?'); 740 // Our scanner for XML declaration-like syntax wants to see a contiguous 741 // buffer, so let's linearize the data. (Ideally, the XML declaration 742 // scanner would be incremental, but this is the rare path anyway.) 743 Vector<uint8_t> contiguous; 744 if (!contiguous.append(first.Elements(), first.Length())) { 745 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 746 return NS_ERROR_OUT_OF_MEMORY; 747 } 748 if (!contiguous.append(mBufferedBytes[1].Elements(), lengthOfPrefix)) { 749 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 750 return NS_ERROR_OUT_OF_MEMORY; 751 } 752 encoding = xmldecl_parse(contiguous.begin(), contiguous.length()); 753 if (encoding) { 754 mEncoding = WrapNotNull(encoding); 755 mCharsetSource = kCharsetFromXmlDeclaration; 756 } 757 // else no XML decl, commit to the UTF-8 default. 758 } else { 759 MOZ_ASSERT(mBufferingBytes); 760 mLookingForXmlDeclarationForXmlViewSource = true; 761 return NS_OK; 762 } 763 } else if (mMode != VIEW_SOURCE_XML && 764 (mForceAutoDetection || mCharsetSource < kCharsetFromChannel)) { 765 // In order to use the buffering logic for meta with mForceAutoDetection, 766 // we set mLookingForMetaCharset but still actually potentially ignore the 767 // meta. 768 mFirstBufferOfMetaScan = mFirstBuffer; 769 MOZ_ASSERT(mLookingForMetaCharset); 770 771 if (mMode == VIEW_SOURCE_HTML) { 772 auto r = mTokenizer->FlushViewSource(); 773 if (r.isErr()) { 774 return r.unwrapErr(); 775 } 776 } 777 auto r = mTreeBuilder->Flush(); 778 if (r.isErr()) { 779 return r.unwrapErr(); 780 } 781 // Encoding committer flushes the ops on the main thread. 782 783 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex); 784 nsHtml5Speculation* speculation = new nsHtml5Speculation( 785 mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(), 786 mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot()); 787 MOZ_ASSERT(!mFlushTimerArmed, "How did we end up arming the timer?"); 788 if (mMode == VIEW_SOURCE_HTML) { 789 mTokenizer->SetViewSourceOpSink(speculation); 790 mTokenizer->StartViewSourceBodyContents(); 791 } else { 792 MOZ_ASSERT(mMode != VIEW_SOURCE_XML); 793 mTreeBuilder->SetOpSink(speculation); 794 } 795 mSpeculations.AppendElement(speculation); // adopts the pointer 796 mSpeculating = true; 797 } else { 798 mLookingForMetaCharset = false; 799 mBufferingBytes = false; 800 mDecodingLocalFileWithoutTokenizing = false; 801 if (mMode == VIEW_SOURCE_HTML) { 802 mTokenizer->StartViewSourceBodyContents(); 803 } 804 } 805 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false); 806 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment( 807 Span(prefix, prefixLength), aFromSegment); 808 } 809 810 return NS_OK; 811 } 812 813 class AddContentRunnable : public Runnable { 814 public: 815 AddContentRunnable(const nsAString& aParserID, uint64_t aBrowserId, 816 uint64_t aBrowsingContextID, nsIURI* aURI, 817 Span<const char16_t> aData, bool aComplete) 818 : Runnable("AddContent") { 819 nsAutoCString spec; 820 aURI->GetSpec(spec); 821 mData.mUri.Construct(NS_ConvertUTF8toUTF16(spec)); 822 mData.mParserID.Construct(aParserID); 823 mData.mBrowserId.Construct(aBrowserId); 824 mData.mBrowsingContextID.Construct(aBrowsingContextID); 825 mData.mContents.Construct(aData.Elements(), aData.Length()); 826 mData.mComplete.Construct(aComplete); 827 } 828 829 NS_IMETHOD Run() override { 830 nsAutoString json; 831 if (!mData.ToJSON(json)) { 832 return NS_ERROR_FAILURE; 833 } 834 835 nsCOMPtr<nsIObserverService> obsService = services::GetObserverService(); 836 if (obsService) { 837 obsService->NotifyObservers(nullptr, "devtools-html-content", 838 PromiseFlatString(json).get()); 839 } 840 841 return NS_OK; 842 } 843 844 HTMLContent mData; 845 }; 846 847 inline void nsHtml5StreamParser::OnNewContent(Span<const char16_t> aData) { 848 #ifdef DEBUG 849 mStartedFeedingDevTools = true; 850 #endif 851 if (mURIToSendToDevtools) { 852 if (aData.IsEmpty()) { 853 // Optimize out the runnable. 854 return; 855 } 856 NS_DispatchToMainThread(new AddContentRunnable( 857 mUUIDForDevtools, mBrowserIdForDevtools, mBrowsingContextIDForDevtools, 858 mURIToSendToDevtools, aData, 859 /* aComplete */ false)); 860 } 861 } 862 863 inline void nsHtml5StreamParser::OnContentComplete() { 864 #ifdef DEBUG 865 mStartedFeedingDevTools = true; 866 #endif 867 if (mURIToSendToDevtools) { 868 NS_DispatchToMainThread(new AddContentRunnable( 869 mUUIDForDevtools, mBrowserIdForDevtools, mBrowsingContextIDForDevtools, 870 mURIToSendToDevtools, Span<const char16_t>(), 871 /* aComplete */ true)); 872 mURIToSendToDevtools = nullptr; 873 mBrowserIdForDevtools = 0; 874 mBrowsingContextIDForDevtools = 0; 875 } 876 } 877 878 nsresult nsHtml5StreamParser::WriteStreamBytes( 879 Span<const uint8_t> aFromSegment) { 880 NS_ASSERTION(IsParserThread(), "Wrong thread!"); 881 mTokenizerMutex.AssertCurrentThreadOwns(); 882 // mLastBuffer should always point to a buffer of the size 883 // READ_BUFFER_SIZE. 884 if (!mLastBuffer) { 885 NS_WARNING("mLastBuffer should not be null!"); 886 MarkAsBroken(NS_ERROR_NULL_POINTER); 887 return NS_ERROR_NULL_POINTER; 888 } 889 size_t totalRead = 0; 890 auto src = aFromSegment; 891 for (;;) { 892 auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE); 893 auto [result, read, written, hadErrors] = 894 mUnicodeDecoder->DecodeToUTF16(src, dst, false); 895 if (!(mLookingForMetaCharset || mDecodingLocalFileWithoutTokenizing)) { 896 OnNewContent(dst.To(written)); 897 } 898 if (hadErrors && !mHasHadErrors) { 899 mHasHadErrors = true; 900 if (mEncoding == UTF_8_ENCODING) { 901 mTreeBuilder->TryToEnableEncodingMenu(); 902 } 903 } 904 src = src.From(read); 905 totalRead += read; 906 mLastBuffer->AdvanceEnd(written); 907 if (result == kOutputFull) { 908 RefPtr<nsHtml5OwningUTF16Buffer> newBuf = 909 nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE); 910 if (!newBuf) { 911 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 912 return NS_ERROR_OUT_OF_MEMORY; 913 } 914 mLastBuffer = (mLastBuffer->next = std::move(newBuf)); 915 } else { 916 MOZ_ASSERT(totalRead == aFromSegment.Length(), 917 "The Unicode decoder consumed the wrong number of bytes."); 918 (void)totalRead; 919 if (!mLookingForMetaCharset && mDecodingLocalFileWithoutTokenizing && 920 mNumBytesBuffered == LOCAL_FILE_UTF_8_BUFFER_SIZE) { 921 MOZ_ASSERT(!mStartedFeedingDetector); 922 for (auto&& buffer : mBufferedBytes) { 923 FeedDetector(buffer); 924 } 925 // If the file is exactly LOCAL_FILE_UTF_8_BUFFER_SIZE bytes long 926 // we end up not considering the EOF. That's not fatal, since we 927 // don't consider the EOF if the file is 928 // LOCAL_FILE_UTF_8_BUFFER_SIZE + 1 bytes long. 929 auto [encoding, source] = GuessEncoding(true); 930 mCharsetSource = source; 931 if (encoding != mEncoding) { 932 mEncoding = encoding; 933 nsresult rv = ReDecodeLocalFile(); 934 if (NS_FAILED(rv)) { 935 return rv; 936 } 937 } else { 938 MOZ_ASSERT(mEncoding == UTF_8_ENCODING); 939 nsresult rv = CommitLocalFileToEncoding(); 940 if (NS_FAILED(rv)) { 941 return rv; 942 } 943 } 944 } 945 return NS_OK; 946 } 947 } 948 } 949 950 [[nodiscard]] nsresult nsHtml5StreamParser::ReDecodeLocalFile() { 951 MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing && !mLookingForMetaCharset); 952 MOZ_ASSERT(mFirstBufferOfMetaScan); 953 MOZ_ASSERT(mCharsetSource == kCharsetFromFinalAutoDetectionFile || 954 (mForceAutoDetection && 955 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection)); 956 957 DiscardMetaSpeculation(); 958 959 MOZ_ASSERT(mEncoding != UTF_8_ENCODING); 960 961 mDecodingLocalFileWithoutTokenizing = false; 962 963 mEncoding->NewDecoderWithBOMRemovalInto(*mUnicodeDecoder); 964 mHasHadErrors = false; 965 966 // Throw away previous decoded data 967 mLastBuffer = mFirstBuffer; 968 mLastBuffer->next = nullptr; 969 mLastBuffer->setStart(0); 970 mLastBuffer->setEnd(0); 971 972 mBufferingBytes = false; 973 mForceAutoDetection = false; // To stop feeding the detector 974 mFirstBufferOfMetaScan = nullptr; 975 976 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, true); 977 978 // Decode again 979 for (auto&& buffer : mBufferedBytes) { 980 DoDataAvailable(buffer); 981 } 982 983 if (mMode == VIEW_SOURCE_HTML) { 984 auto r = mTokenizer->FlushViewSource(); 985 if (r.isErr()) { 986 return r.unwrapErr(); 987 } 988 } 989 auto r = mTreeBuilder->Flush(); 990 if (r.isErr()) { 991 return r.unwrapErr(); 992 } 993 return NS_OK; 994 } 995 996 [[nodiscard]] nsresult nsHtml5StreamParser::CommitLocalFileToEncoding() { 997 MOZ_ASSERT(mDecodingLocalFileWithoutTokenizing && !mLookingForMetaCharset); 998 MOZ_ASSERT(mFirstBufferOfMetaScan); 999 mDecodingLocalFileWithoutTokenizing = false; 1000 MOZ_ASSERT(mCharsetSource == kCharsetFromFinalAutoDetectionFile || 1001 (mForceAutoDetection && 1002 mCharsetSource == kCharsetFromInitialUserForcedAutoDetection)); 1003 MOZ_ASSERT(mEncoding == UTF_8_ENCODING); 1004 1005 MOZ_ASSERT(!mStartedFeedingDevTools); 1006 if (mURIToSendToDevtools) { 1007 nsHtml5OwningUTF16Buffer* buffer = mFirstBufferOfMetaScan; 1008 while (buffer) { 1009 Span<const char16_t> data(buffer->getBuffer() + buffer->getStart(), 1010 buffer->getLength()); 1011 OnNewContent(data); 1012 buffer = buffer->next; 1013 } 1014 } 1015 1016 mFirstBufferOfMetaScan = nullptr; 1017 1018 mBufferingBytes = false; 1019 mForceAutoDetection = false; // To stop feeding the detector 1020 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, true); 1021 if (mMode == VIEW_SOURCE_HTML) { 1022 auto r = mTokenizer->FlushViewSource(); 1023 if (r.isErr()) { 1024 return r.unwrapErr(); 1025 } 1026 } 1027 auto r = mTreeBuilder->Flush(); 1028 if (r.isErr()) { 1029 return r.unwrapErr(); 1030 } 1031 return NS_OK; 1032 } 1033 1034 class MaybeRunCollector : public Runnable { 1035 public: 1036 explicit MaybeRunCollector(nsIDocShell* aDocShell) 1037 : Runnable("MaybeRunCollector"), mDocShell(aDocShell) {} 1038 1039 NS_IMETHOD Run() override { 1040 nsJSContext::MaybeRunNextCollectorSlice(mDocShell, 1041 JS::GCReason::HTML_PARSER); 1042 return NS_OK; 1043 } 1044 1045 nsCOMPtr<nsIDocShell> mDocShell; 1046 }; 1047 1048 nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) { 1049 MOZ_RELEASE_ASSERT(STREAM_NOT_STARTED == mStreamState, 1050 "Got OnStartRequest when the stream had already started."); 1051 MOZ_ASSERT( 1052 !mExecutor->HasStarted(), 1053 "Got OnStartRequest at the wrong stage in the executor life cycle."); 1054 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!"); 1055 1056 // To avoid the cost of instantiating the detector when it's not needed, 1057 // let's instantiate only if we make it out of this method with the 1058 // intent to use it. 1059 auto detectorCreator = MakeScopeExit([&] { 1060 if ((mForceAutoDetection || mCharsetSource < kCharsetFromParentFrame) || 1061 !(mMode == LOAD_AS_DATA || mMode == VIEW_SOURCE_XML)) { 1062 mDetector = mozilla::EncodingDetector::Create(); 1063 } 1064 }); 1065 1066 mRequest = aRequest; 1067 1068 mStreamState = STREAM_BEING_READ; 1069 1070 // For View Source, the parser should run with scripts "enabled" if a normal 1071 // load would have scripts enabled. 1072 bool scriptingEnabled = 1073 mMode == LOAD_AS_DATA ? false : mExecutor->IsScriptEnabled(); 1074 mOwner->StartTokenizer(scriptingEnabled); 1075 1076 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing); 1077 bool isSrcdoc = false; 1078 nsCOMPtr<nsIChannel> channel; 1079 nsresult rv = GetChannel(getter_AddRefs(channel)); 1080 if (NS_SUCCEEDED(rv)) { 1081 isSrcdoc = NS_IsSrcdocChannel(channel); 1082 if (!isSrcdoc && mCharsetSource <= kCharsetFromFallback) { 1083 nsCOMPtr<nsIURI> originalURI; 1084 rv = channel->GetOriginalURI(getter_AddRefs(originalURI)); 1085 if (NS_SUCCEEDED(rv)) { 1086 if (originalURI->SchemeIs("resource")) { 1087 mCharsetSource = kCharsetFromBuiltIn; 1088 mEncoding = UTF_8_ENCODING; 1089 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false); 1090 } else { 1091 nsCOMPtr<nsIURI> currentURI; 1092 rv = channel->GetURI(getter_AddRefs(currentURI)); 1093 if (NS_SUCCEEDED(rv)) { 1094 nsCOMPtr<nsIURI> innermost = NS_GetInnermostURI(currentURI); 1095 if (innermost->SchemeIs("file")) { 1096 MOZ_ASSERT(mEncoding == UTF_8_ENCODING); 1097 if (!(mMode == LOAD_AS_DATA || mMode == VIEW_SOURCE_XML)) { 1098 mDecodingLocalFileWithoutTokenizing = true; 1099 } 1100 } else { 1101 nsAutoCString host; 1102 innermost->GetAsciiHost(host); 1103 if (!host.IsEmpty()) { 1104 // First let's see if the host is DNS-absolute and ends with a 1105 // dot and get rid of that one. 1106 if (host.Last() == '.') { 1107 host.SetLength(host.Length() - 1); 1108 } 1109 int32_t index = host.RFindChar('.'); 1110 if (index != kNotFound) { 1111 // We tolerate an IPv4 component as generic "TLD", so don't 1112 // bother checking. 1113 ToLowerCase( 1114 Substring(host, index + 1, host.Length() - (index + 1)), 1115 mTLD); 1116 } 1117 } 1118 } 1119 } 1120 } 1121 } 1122 } 1123 } 1124 mTreeBuilder->setIsSrcdocDocument(isSrcdoc); 1125 mTreeBuilder->setScriptingEnabled(scriptingEnabled); 1126 mTreeBuilder->SetPreventScriptExecution( 1127 !((mMode == NORMAL) && scriptingEnabled)); 1128 mTreeBuilder->setAllowDeclarativeShadowRoots( 1129 mExecutor->GetDocument()->AllowsDeclarativeShadowRoots()); 1130 mTokenizer->start(); 1131 mExecutor->Start(); 1132 mExecutor->StartReadingFromStage(); 1133 1134 if (mMode == PLAIN_TEXT) { 1135 mTreeBuilder->StartPlainText(); 1136 mTokenizer->StartPlainText(); 1137 MOZ_ASSERT( 1138 mTemplatePushedOrHeadPopped); // Needed to force 1024-byte sniffing 1139 // Flush the ops to put them where ContinueAfterScriptsOrEncodingCommitment 1140 // can find them. 1141 auto r = mTreeBuilder->Flush(); 1142 if (r.isErr()) { 1143 return mExecutor->MarkAsBroken(r.unwrapErr()); 1144 } 1145 } else if (mMode == VIEW_SOURCE_PLAIN) { 1146 nsAutoString viewSourceTitle; 1147 CopyUTF8toUTF16(mViewSourceTitle, viewSourceTitle); 1148 mTreeBuilder->EnsureBufferSpace(viewSourceTitle.Length()); 1149 mTreeBuilder->StartPlainTextViewSource(viewSourceTitle); 1150 mTokenizer->StartPlainText(); 1151 MOZ_ASSERT( 1152 mTemplatePushedOrHeadPopped); // Needed to force 1024-byte sniffing 1153 // Flush the ops to put them where ContinueAfterScriptsOrEncodingCommitment 1154 // can find them. 1155 auto r = mTreeBuilder->Flush(); 1156 if (r.isErr()) { 1157 return mExecutor->MarkAsBroken(r.unwrapErr()); 1158 } 1159 } else if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) { 1160 // Generate and flush the View Source document up to and including the 1161 // pre element start. 1162 mTokenizer->StartViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle)); 1163 if (mMode == VIEW_SOURCE_XML) { 1164 mTokenizer->StartViewSourceBodyContents(); 1165 } 1166 // Flush the ops to put them where ContinueAfterScriptsOrEncodingCommitment 1167 // can find them. 1168 auto r = mTokenizer->FlushViewSource(); 1169 if (r.isErr()) { 1170 return mExecutor->MarkAsBroken(r.unwrapErr()); 1171 } 1172 } 1173 1174 /* 1175 * If you move the following line, be very careful not to cause 1176 * WillBuildModel to be called before the document has had its 1177 * script global object set. 1178 */ 1179 rv = mExecutor->WillBuildModel(); 1180 NS_ENSURE_SUCCESS(rv, rv); 1181 1182 RefPtr<nsHtml5OwningUTF16Buffer> newBuf = 1183 nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE); 1184 if (!newBuf) { 1185 // marks this stream parser as terminated, 1186 // which prevents entry to code paths that 1187 // would use mFirstBuffer or mLastBuffer. 1188 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1189 } 1190 MOZ_ASSERT(!mFirstBuffer, "How come we have the first buffer set?"); 1191 MOZ_ASSERT(!mLastBuffer, "How come we have the last buffer set?"); 1192 mFirstBuffer = mLastBuffer = newBuf; 1193 1194 rv = NS_OK; 1195 1196 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(mRequest, &rv)); 1197 if (NS_SUCCEEDED(rv)) { 1198 nsAutoCString method; 1199 (void)httpChannel->GetRequestMethod(method); 1200 // XXX does Necko have a way to renavigate POST, etc. without hitting 1201 // the network? 1202 if (!method.EqualsLiteral("GET")) { 1203 // This is the old Gecko behavior but the HTML5 spec disagrees. 1204 // Don't reparse on POST. 1205 mReparseForbidden = true; 1206 } 1207 } 1208 1209 // Attempt to retarget delivery of data (via OnDataAvailable) to the parser 1210 // thread, rather than through the main thread. 1211 nsCOMPtr<nsIThreadRetargetableRequest> threadRetargetableRequest = 1212 do_QueryInterface(mRequest, &rv); 1213 if (threadRetargetableRequest) { 1214 rv = threadRetargetableRequest->RetargetDeliveryTo(mEventTarget); 1215 if (NS_SUCCEEDED(rv)) { 1216 // Parser thread should be now ready to get data from necko and parse it 1217 // and main thread might have a chance to process a collector slice. 1218 // We need to do this asynchronously so that necko may continue processing 1219 // the request. 1220 nsCOMPtr<nsIRunnable> runnable = 1221 new MaybeRunCollector(mExecutor->GetDocument()->GetDocShell()); 1222 mozilla::SchedulerGroup::Dispatch(runnable.forget()); 1223 } 1224 } 1225 1226 if (NS_FAILED(rv)) { 1227 NS_WARNING("Failed to retarget HTML data delivery to the parser thread."); 1228 } 1229 1230 if (mCharsetSource == kCharsetFromParentFrame) { 1231 // Remember this for error reporting. 1232 mInitialEncodingWasFromParentFrame = true; 1233 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing); 1234 } 1235 1236 if (mForceAutoDetection || mCharsetSource < kCharsetFromChannel) { 1237 mBufferingBytes = true; 1238 if (mMode != VIEW_SOURCE_XML) { 1239 // We need to set mLookingForMetaCharset to true here in case the first 1240 // buffer to arrive is larger than 1024. We need the code that splits 1241 // the buffers at 1024 bytes to work even in that case. 1242 mLookingForMetaCharset = true; 1243 } 1244 } 1245 1246 if (mCharsetSource < kCharsetFromUtf8OnlyMime) { 1247 // we aren't ready to commit to an encoding yet 1248 // leave converter uninstantiated for now 1249 return NS_OK; 1250 } 1251 1252 MOZ_ASSERT(!(mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML)); 1253 1254 MOZ_ASSERT(mEncoding == UTF_8_ENCODING, 1255 "How come UTF-8-only MIME type didn't set encoding to UTF-8?"); 1256 1257 // We are loading JSON/WebVTT/etc. into a browsing context. 1258 // There's no need to remove the BOM manually here, because 1259 // the UTF-8 decoder removes it. 1260 mReparseForbidden = true; 1261 mForceAutoDetection = false; 1262 1263 // Instantiate the converter here to avoid BOM sniffing. 1264 mDecodingLocalFileWithoutTokenizing = false; 1265 mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval(); 1266 return NS_OK; 1267 } 1268 1269 void nsHtml5StreamParser::DoStopRequest() { 1270 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1271 MOZ_RELEASE_ASSERT(STREAM_BEING_READ == mStreamState, 1272 "Stream ended without being open."); 1273 mTokenizerMutex.AssertCurrentThreadOwns(); 1274 1275 auto guard = MakeScopeExit([&] { OnContentComplete(); }); 1276 1277 if (IsTerminated()) { 1278 return; 1279 } 1280 1281 if (MOZ_UNLIKELY(mLookingForXmlDeclarationForXmlViewSource)) { 1282 mLookingForXmlDeclarationForXmlViewSource = false; 1283 mBufferingBytes = false; 1284 mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling(); 1285 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false); 1286 1287 for (auto&& buffer : mBufferedBytes) { 1288 nsresult rv = WriteStreamBytes(buffer); 1289 if (NS_FAILED(rv)) { 1290 MarkAsBroken(rv); 1291 return; 1292 } 1293 } 1294 } else if (!mUnicodeDecoder) { 1295 nsresult rv; 1296 if (NS_FAILED(rv = SniffStreamBytes(Span<const uint8_t>(), true))) { 1297 MarkAsBroken(rv); 1298 return; 1299 } 1300 } 1301 1302 MOZ_ASSERT(mUnicodeDecoder, 1303 "Should have a decoder after finalizing sniffing."); 1304 1305 // mLastBuffer should always point to a buffer of the size 1306 // READ_BUFFER_SIZE. 1307 if (!mLastBuffer) { 1308 NS_WARNING("mLastBuffer should not be null!"); 1309 MarkAsBroken(NS_ERROR_NULL_POINTER); 1310 return; 1311 } 1312 1313 Span<uint8_t> src; // empty span 1314 for (;;) { 1315 auto dst = mLastBuffer->TailAsSpan(READ_BUFFER_SIZE); 1316 uint32_t result; 1317 size_t read; 1318 size_t written; 1319 bool hadErrors; 1320 // Do not use structured binding lest deal with [-Werror=unused-variable] 1321 std::tie(result, read, written, hadErrors) = 1322 mUnicodeDecoder->DecodeToUTF16(src, dst, true); 1323 if (!(mLookingForMetaCharset || mDecodingLocalFileWithoutTokenizing)) { 1324 OnNewContent(dst.To(written)); 1325 } 1326 if (hadErrors) { 1327 mHasHadErrors = true; 1328 } 1329 MOZ_ASSERT(read == 0, "How come an empty span was read form?"); 1330 mLastBuffer->AdvanceEnd(written); 1331 if (result == kOutputFull) { 1332 RefPtr<nsHtml5OwningUTF16Buffer> newBuf = 1333 nsHtml5OwningUTF16Buffer::FalliblyCreate(READ_BUFFER_SIZE); 1334 if (!newBuf) { 1335 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1336 return; 1337 } 1338 mLastBuffer = (mLastBuffer->next = std::move(newBuf)); 1339 } else { 1340 if (!mLookingForMetaCharset && mDecodingLocalFileWithoutTokenizing) { 1341 MOZ_ASSERT(mNumBytesBuffered < LOCAL_FILE_UTF_8_BUFFER_SIZE); 1342 MOZ_ASSERT(!mStartedFeedingDetector); 1343 for (auto&& buffer : mBufferedBytes) { 1344 FeedDetector(buffer); 1345 } 1346 MOZ_ASSERT(!mChardetEof); 1347 DetectorEof(); 1348 auto [encoding, source] = GuessEncoding(true); 1349 mCharsetSource = source; 1350 if (encoding != mEncoding) { 1351 mEncoding = encoding; 1352 nsresult rv = ReDecodeLocalFile(); 1353 if (NS_FAILED(rv)) { 1354 MarkAsBroken(rv); 1355 return; 1356 } 1357 DoStopRequest(); 1358 return; 1359 } 1360 MOZ_ASSERT(mEncoding == UTF_8_ENCODING); 1361 nsresult rv = CommitLocalFileToEncoding(); 1362 if (NS_FAILED(rv)) { 1363 MarkAsBroken(rv); 1364 return; 1365 } 1366 } 1367 break; 1368 } 1369 } 1370 1371 mStreamState = STREAM_ENDED; 1372 1373 if (IsTerminatedOrInterrupted()) { 1374 return; 1375 } 1376 1377 ParseAvailableData(); 1378 } 1379 1380 class nsHtml5RequestStopper : public Runnable { 1381 private: 1382 nsHtml5StreamParserPtr mStreamParser; 1383 1384 public: 1385 explicit nsHtml5RequestStopper(nsHtml5StreamParser* aStreamParser) 1386 : Runnable("nsHtml5RequestStopper"), mStreamParser(aStreamParser) {} 1387 NS_IMETHOD Run() override { 1388 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex); 1389 mStreamParser->DoStopRequest(); 1390 mStreamParser->PostLoadFlusher(); 1391 return NS_OK; 1392 } 1393 }; 1394 1395 nsresult nsHtml5StreamParser::OnStopRequest( 1396 nsIRequest* aRequest, nsresult status, 1397 const mozilla::ReentrantMonitorAutoEnter& aProofOfLock) { 1398 MOZ_ASSERT_IF(aRequest, mRequest == aRequest); 1399 if (mOnStopCalled) { 1400 // OnStopRequest already executed (probably OMT). 1401 MOZ_ASSERT(NS_IsMainThread(), "Expected to run on main thread"); 1402 } else { 1403 mOnStopCalled = true; 1404 1405 if (MOZ_UNLIKELY(NS_IsMainThread())) { 1406 nsCOMPtr<nsIRunnable> stopper = new nsHtml5RequestStopper(this); 1407 if (NS_FAILED( 1408 mEventTarget->Dispatch(stopper, nsIThread::DISPATCH_NORMAL))) { 1409 NS_WARNING("Dispatching StopRequest event failed."); 1410 } 1411 } else { 1412 if (StaticPrefs::network_send_OnDataFinished_html5parser()) { 1413 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1414 mozilla::MutexAutoLock autoLock(mTokenizerMutex); 1415 DoStopRequest(); 1416 PostLoadFlusher(); 1417 } else { 1418 // Let the MainThread event handle this, even though it will just 1419 // send it back to this thread, so we can accurately judge the impact 1420 // of this change. This should eventually be removed 1421 mOnStopCalled = false; 1422 // don't record any telemetry for this 1423 return NS_OK; 1424 } 1425 } 1426 } 1427 return NS_OK; 1428 } 1429 1430 void nsHtml5StreamParser::DoDataAvailableBuffer( 1431 mozilla::Buffer<uint8_t>&& aBuffer) { 1432 if (MOZ_UNLIKELY(!mBufferingBytes)) { 1433 DoDataAvailable(aBuffer); 1434 return; 1435 } 1436 if (MOZ_UNLIKELY(mLookingForXmlDeclarationForXmlViewSource)) { 1437 const uint8_t* elements = aBuffer.Elements(); 1438 size_t length = aBuffer.Length(); 1439 const uint8_t* lt = (const uint8_t*)memchr(elements, '>', length); 1440 if (!lt) { 1441 mBufferedBytes.AppendElement(std::move(aBuffer)); 1442 return; 1443 } 1444 1445 // We found an '>'. Now there either is or isn't an XML decl. 1446 length = (lt - elements) + 1; 1447 Vector<uint8_t> contiguous; 1448 for (auto&& buffer : mBufferedBytes) { 1449 if (!contiguous.append(buffer.Elements(), buffer.Length())) { 1450 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1451 return; 1452 } 1453 } 1454 if (!contiguous.append(elements, length)) { 1455 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1456 return; 1457 } 1458 1459 const Encoding* encoding = 1460 xmldecl_parse(contiguous.begin(), contiguous.length()); 1461 if (encoding) { 1462 mEncoding = WrapNotNull(encoding); 1463 mCharsetSource = kCharsetFromXmlDeclaration; 1464 } 1465 1466 mLookingForXmlDeclarationForXmlViewSource = false; 1467 mBufferingBytes = false; 1468 mUnicodeDecoder = mEncoding->NewDecoderWithoutBOMHandling(); 1469 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, false); 1470 1471 for (auto&& buffer : mBufferedBytes) { 1472 DoDataAvailable(buffer); 1473 } 1474 DoDataAvailable(aBuffer); 1475 mBufferedBytes.Clear(); 1476 return; 1477 } 1478 CheckedInt<size_t> bufferedPlusLength(aBuffer.Length()); 1479 bufferedPlusLength += mNumBytesBuffered; 1480 if (!bufferedPlusLength.isValid()) { 1481 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1482 return; 1483 } 1484 // Ensure that WriteStreamBytes() sees buffers ending 1485 // exactly at the two special boundaries. 1486 bool metaBoundaryWithinBuffer = 1487 mLookingForMetaCharset && 1488 mNumBytesBuffered < UNCONDITIONAL_META_SCAN_BOUNDARY && 1489 bufferedPlusLength.value() > UNCONDITIONAL_META_SCAN_BOUNDARY; 1490 bool localFileLimitWithinBuffer = 1491 mDecodingLocalFileWithoutTokenizing && 1492 mNumBytesBuffered < LOCAL_FILE_UTF_8_BUFFER_SIZE && 1493 bufferedPlusLength.value() > LOCAL_FILE_UTF_8_BUFFER_SIZE; 1494 if (!metaBoundaryWithinBuffer && !localFileLimitWithinBuffer) { 1495 // Truncation OK, because we just checked the range. 1496 mNumBytesBuffered = bufferedPlusLength.value(); 1497 mBufferedBytes.AppendElement(std::move(aBuffer)); 1498 DoDataAvailable(mBufferedBytes.LastElement()); 1499 } else { 1500 MOZ_RELEASE_ASSERT( 1501 !(metaBoundaryWithinBuffer && localFileLimitWithinBuffer), 1502 "How can Necko give us a buffer this large?"); 1503 size_t boundary = metaBoundaryWithinBuffer 1504 ? UNCONDITIONAL_META_SCAN_BOUNDARY 1505 : LOCAL_FILE_UTF_8_BUFFER_SIZE; 1506 // Truncation OK, because the constant is small enough. 1507 size_t overBoundary = bufferedPlusLength.value() - boundary; 1508 MOZ_RELEASE_ASSERT(overBoundary < aBuffer.Length()); 1509 size_t untilBoundary = aBuffer.Length() - overBoundary; 1510 auto span = aBuffer.AsSpan(); 1511 auto head = span.To(untilBoundary); 1512 auto tail = span.From(untilBoundary); 1513 MOZ_RELEASE_ASSERT(mNumBytesBuffered + untilBoundary == boundary); 1514 // The following copies may end up being useless, but optimizing 1515 // them away would add complexity. 1516 Maybe<Buffer<uint8_t>> maybeHead = Buffer<uint8_t>::CopyFrom(head); 1517 if (maybeHead.isNothing()) { 1518 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1519 return; 1520 } 1521 mNumBytesBuffered = boundary; 1522 mBufferedBytes.AppendElement(std::move(*maybeHead)); 1523 DoDataAvailable(mBufferedBytes.LastElement()); 1524 // Re-decode may have happened here. 1525 1526 Maybe<Buffer<uint8_t>> maybeTail = Buffer<uint8_t>::CopyFrom(tail); 1527 if (maybeTail.isNothing()) { 1528 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1529 return; 1530 } 1531 mNumBytesBuffered += tail.Length(); 1532 mBufferedBytes.AppendElement(std::move(*maybeTail)); 1533 DoDataAvailable(mBufferedBytes.LastElement()); 1534 } 1535 // Do this clean-up here to avoid use-after-free when 1536 // DoDataAvailable is passed a span pointing into an 1537 // element of mBufferedBytes. 1538 if (!mBufferingBytes) { 1539 mBufferedBytes.Clear(); 1540 } 1541 } 1542 1543 void nsHtml5StreamParser::DoDataAvailable(Span<const uint8_t> aBuffer) { 1544 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1545 MOZ_RELEASE_ASSERT(STREAM_BEING_READ == mStreamState, 1546 "DoDataAvailable called when stream not open."); 1547 mTokenizerMutex.AssertCurrentThreadOwns(); 1548 1549 if (IsTerminated()) { 1550 return; 1551 } 1552 1553 nsresult rv; 1554 if (HasDecoder()) { 1555 if ((mForceAutoDetection || mCharsetSource < kCharsetFromParentFrame) && 1556 !mBufferingBytes && !mReparseForbidden && 1557 !(mMode == LOAD_AS_DATA || mMode == VIEW_SOURCE_XML)) { 1558 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing, 1559 "How is mBufferingBytes false if " 1560 "mDecodingLocalFileWithoutTokenizing is true?"); 1561 FeedDetector(aBuffer); 1562 } 1563 rv = WriteStreamBytes(aBuffer); 1564 } else { 1565 rv = SniffStreamBytes(aBuffer, false); 1566 } 1567 if (NS_FAILED(rv)) { 1568 MarkAsBroken(rv); 1569 return; 1570 } 1571 1572 if (IsTerminatedOrInterrupted()) { 1573 return; 1574 } 1575 1576 if (!mLookingForMetaCharset && mDecodingLocalFileWithoutTokenizing) { 1577 return; 1578 } 1579 1580 ParseAvailableData(); 1581 1582 if (mBomState != BOM_SNIFFING_OVER || mFlushTimerArmed || mSpeculating) { 1583 return; 1584 } 1585 1586 { 1587 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex); 1588 mFlushTimer->InitWithNamedFuncCallback( 1589 nsHtml5StreamParser::TimerCallback, static_cast<void*>(this), 1590 mFlushTimerEverFired ? StaticPrefs::html5_flushtimer_initialdelay() 1591 : StaticPrefs::html5_flushtimer_subsequentdelay(), 1592 nsITimer::TYPE_ONE_SHOT, "nsHtml5StreamParser::DoDataAvailable"_ns); 1593 } 1594 mFlushTimerArmed = true; 1595 } 1596 1597 class nsHtml5DataAvailable : public Runnable { 1598 private: 1599 nsHtml5StreamParserPtr mStreamParser; 1600 Buffer<uint8_t> mData; 1601 1602 public: 1603 nsHtml5DataAvailable(nsHtml5StreamParser* aStreamParser, 1604 Buffer<uint8_t>&& aData) 1605 : Runnable("nsHtml5DataAvailable"), 1606 mStreamParser(aStreamParser), 1607 mData(std::move(aData)) {} 1608 NS_IMETHOD Run() override { 1609 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex); 1610 mStreamParser->DoDataAvailableBuffer(std::move(mData)); 1611 mStreamParser->PostLoadFlusher(); 1612 return NS_OK; 1613 } 1614 }; 1615 1616 nsresult nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest, 1617 nsIInputStream* aInStream, 1618 uint64_t aSourceOffset, 1619 uint32_t aLength) { 1620 nsresult rv; 1621 1622 MOZ_ASSERT(mRequest == aRequest, "Got data on wrong stream."); 1623 uint32_t totalRead; 1624 // Main thread to parser thread dispatch requires copying to buffer first. 1625 if (MOZ_UNLIKELY(NS_IsMainThread())) { 1626 if (NS_FAILED(rv = mExecutor->IsBroken())) { 1627 return rv; 1628 } 1629 Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::Alloc(aLength); 1630 if (maybe.isNothing()) { 1631 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1632 } 1633 Buffer<uint8_t> data(std::move(*maybe)); 1634 rv = aInStream->Read(reinterpret_cast<char*>(data.Elements()), 1635 data.Length(), &totalRead); 1636 NS_ENSURE_SUCCESS(rv, rv); 1637 MOZ_ASSERT(totalRead == aLength); 1638 1639 nsCOMPtr<nsIRunnable> dataAvailable = 1640 new nsHtml5DataAvailable(this, std::move(data)); 1641 if (NS_FAILED(mEventTarget->Dispatch(dataAvailable, 1642 nsIThread::DISPATCH_NORMAL))) { 1643 NS_WARNING("Dispatching DataAvailable event failed."); 1644 } 1645 return rv; 1646 } 1647 1648 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1649 mozilla::MutexAutoLock autoLock(mTokenizerMutex); 1650 1651 if (NS_FAILED(rv = mTreeBuilder->IsBroken())) { 1652 return rv; 1653 } 1654 1655 // Since we're getting OnDataAvailable directly on the parser thread, 1656 // there is no nsHtml5DataAvailable that would call PostLoadFlusher. 1657 // Hence, we need to call PostLoadFlusher() before this method returns. 1658 // Braces for RAII clarity relative to the mutex despite not being 1659 // strictly necessary. 1660 { 1661 auto speculationFlusher = MakeScopeExit([&] { PostLoadFlusher(); }); 1662 1663 if (mBufferingBytes) { 1664 Maybe<Buffer<uint8_t>> maybe = Buffer<uint8_t>::Alloc(aLength); 1665 if (maybe.isNothing()) { 1666 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 1667 return NS_ERROR_OUT_OF_MEMORY; 1668 } 1669 Buffer<uint8_t> data(std::move(*maybe)); 1670 rv = aInStream->Read(reinterpret_cast<char*>(data.Elements()), 1671 data.Length(), &totalRead); 1672 NS_ENSURE_SUCCESS(rv, rv); 1673 MOZ_ASSERT(totalRead == aLength); 1674 DoDataAvailableBuffer(std::move(data)); 1675 return rv; 1676 } 1677 // Read directly from response buffer. 1678 rv = aInStream->ReadSegments(CopySegmentsToParser, this, aLength, 1679 &totalRead); 1680 NS_ENSURE_SUCCESS(rv, rv); 1681 MOZ_ASSERT(totalRead == aLength); 1682 return rv; 1683 } 1684 } 1685 1686 // Called under lock by function ptr 1687 /* static */ 1688 nsresult nsHtml5StreamParser::CopySegmentsToParser( 1689 nsIInputStream* aInStream, void* aClosure, const char* aFromSegment, 1690 uint32_t aToOffset, uint32_t aCount, 1691 uint32_t* aWriteCount) MOZ_NO_THREAD_SAFETY_ANALYSIS { 1692 nsHtml5StreamParser* parser = static_cast<nsHtml5StreamParser*>(aClosure); 1693 1694 parser->DoDataAvailable(AsBytes(Span(aFromSegment, aCount))); 1695 // Assume DoDataAvailable consumed all available bytes. 1696 *aWriteCount = aCount; 1697 return NS_OK; 1698 } 1699 1700 const Encoding* nsHtml5StreamParser::PreferredForInternalEncodingDecl( 1701 const nsAString& aEncoding) { 1702 const Encoding* newEncoding = Encoding::ForLabel(aEncoding); 1703 if (!newEncoding) { 1704 // the encoding name is bogus 1705 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported", true, 1706 mTokenizer->getLineNumber()); 1707 return nullptr; 1708 } 1709 1710 if (newEncoding == UTF_16BE_ENCODING || newEncoding == UTF_16LE_ENCODING) { 1711 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16", true, 1712 mTokenizer->getLineNumber()); 1713 newEncoding = UTF_8_ENCODING; 1714 } 1715 1716 if (newEncoding == X_USER_DEFINED_ENCODING) { 1717 // WebKit/Blink hack for Indian and Armenian legacy sites 1718 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUserDefined", true, 1719 mTokenizer->getLineNumber()); 1720 newEncoding = WINDOWS_1252_ENCODING; 1721 } 1722 1723 if (newEncoding == REPLACEMENT_ENCODING) { 1724 // No line number, because the replacement encoding doesn't allow 1725 // showing the lines. 1726 mTreeBuilder->MaybeComplainAboutCharset("EncMetaReplacement", true, 0); 1727 } 1728 1729 return newEncoding; 1730 } 1731 1732 bool nsHtml5StreamParser::internalEncodingDeclaration(nsHtml5String aEncoding) { 1733 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1734 if ((mCharsetSource >= kCharsetFromMetaTag && 1735 mCharsetSource != kCharsetFromFinalAutoDetectionFile) || 1736 mSeenEligibleMetaCharset) { 1737 return false; 1738 } 1739 1740 nsString newEncoding; // Not Auto, because using it to hold nsStringBuffer* 1741 aEncoding.ToString(newEncoding); 1742 auto encoding = PreferredForInternalEncodingDecl(newEncoding); 1743 if (!encoding) { 1744 return false; 1745 } 1746 1747 mSeenEligibleMetaCharset = true; 1748 1749 if (!mLookingForMetaCharset) { 1750 if (mInitialEncodingWasFromParentFrame) { 1751 mTreeBuilder->MaybeComplainAboutCharset("EncMetaTooLateFrame", true, 1752 mTokenizer->getLineNumber()); 1753 } else { 1754 mTreeBuilder->MaybeComplainAboutCharset("EncMetaTooLate", true, 1755 mTokenizer->getLineNumber()); 1756 } 1757 return false; 1758 } 1759 if (mTemplatePushedOrHeadPopped) { 1760 mTreeBuilder->MaybeComplainAboutCharset("EncMetaAfterHeadInKilobyte", false, 1761 mTokenizer->getLineNumber()); 1762 } 1763 1764 if (mForceAutoDetection && 1765 (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) { 1766 return false; 1767 } 1768 1769 mNeedsEncodingSwitchTo = encoding; 1770 mEncodingSwitchSource = kCharsetFromMetaTag; 1771 return true; 1772 } 1773 1774 bool nsHtml5StreamParser::TemplatePushedOrHeadPopped() { 1775 MOZ_ASSERT( 1776 IsParserThread() || mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN, 1777 "Wrong thread!"); 1778 mTemplatePushedOrHeadPopped = true; 1779 return mNumBytesBuffered >= UNCONDITIONAL_META_SCAN_BOUNDARY; 1780 } 1781 1782 void nsHtml5StreamParser::RememberGt(int32_t aPos) { 1783 if (mLookingForMetaCharset) { 1784 mGtBuffer = mFirstBuffer; 1785 mGtPos = aPos; 1786 } 1787 } 1788 1789 void nsHtml5StreamParser::PostLoadFlusher() { 1790 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1791 mTokenizerMutex.AssertCurrentThreadOwns(); 1792 1793 mTreeBuilder->FlushLoads(); 1794 // Dispatch this runnable unconditionally, because the loads 1795 // that need flushing may have been flushed earlier even if the 1796 // flush right above here did nothing. (Is this still true?) 1797 nsCOMPtr<nsIRunnable> runnable(mLoadFlusher); 1798 if (NS_FAILED( 1799 DispatchToMain(CreateRenderBlockingRunnable(runnable.forget())))) { 1800 NS_WARNING("failed to dispatch load flush event"); 1801 } 1802 1803 if ((mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) && 1804 mTokenizer->ShouldFlushViewSource()) { 1805 auto r = mTreeBuilder->Flush(); // delete useless ops 1806 MOZ_ASSERT(r.isOk(), "Should have null sink with View Source"); 1807 r = mTokenizer->FlushViewSource(); 1808 if (r.isErr()) { 1809 MarkAsBroken(r.unwrapErr()); 1810 return; 1811 } 1812 if (r.unwrap()) { 1813 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher); 1814 if (NS_FAILED(DispatchToMain(runnable.forget()))) { 1815 NS_WARNING("failed to dispatch executor flush event"); 1816 } 1817 } 1818 } 1819 } 1820 1821 void nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer() { 1822 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 1823 if (mFlushTimerArmed) { 1824 // avoid calling Cancel if the flush timer isn't armed to avoid acquiring 1825 // a mutex 1826 { 1827 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex); 1828 mFlushTimer->Cancel(); 1829 } 1830 mFlushTimerArmed = false; 1831 } 1832 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) { 1833 auto r = mTokenizer->FlushViewSource(); 1834 if (r.isErr()) { 1835 MarkAsBroken(r.unwrapErr()); 1836 } 1837 } 1838 auto r = mTreeBuilder->Flush(); 1839 if (r.isErr()) { 1840 MarkAsBroken(r.unwrapErr()); 1841 } 1842 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher); 1843 if (NS_FAILED(DispatchToMain(runnable.forget()))) { 1844 NS_WARNING("failed to dispatch executor flush event"); 1845 } 1846 } 1847 1848 void nsHtml5StreamParser::SwitchDecoderIfAsciiSoFar( 1849 NotNull<const Encoding*> aEncoding) { 1850 if (mEncoding == aEncoding) { 1851 MOZ_ASSERT(!mStartedFeedingDevTools); 1852 // Report all already-decoded buffers to the dev tools if needed. 1853 if (mURIToSendToDevtools) { 1854 nsHtml5OwningUTF16Buffer* buffer = mFirstBufferOfMetaScan; 1855 while (buffer) { 1856 auto s = Span(buffer->getBuffer(), buffer->getEnd()); 1857 OnNewContent(s); 1858 buffer = buffer->next; 1859 } 1860 } 1861 return; 1862 } 1863 if (!mEncoding->IsAsciiCompatible() || !aEncoding->IsAsciiCompatible()) { 1864 return; 1865 } 1866 size_t numAscii = 0; 1867 MOZ_ASSERT(mFirstBufferOfMetaScan, 1868 "Why did we come here without starting meta scan?"); 1869 nsHtml5OwningUTF16Buffer* buffer = mFirstBufferOfMetaScan; 1870 while (buffer != mFirstBuffer) { 1871 MOZ_ASSERT(buffer, "mFirstBuffer should have acted as sentinel!"); 1872 MOZ_ASSERT(buffer->getStart() == buffer->getEnd(), 1873 "Why wasn't an early buffer fully consumed?"); 1874 auto s = Span(buffer->getBuffer(), buffer->getStart()); 1875 if (!IsAscii(s)) { 1876 return; 1877 } 1878 numAscii += s.Length(); 1879 buffer = buffer->next; 1880 } 1881 auto s = Span(mFirstBuffer->getBuffer(), mFirstBuffer->getStart()); 1882 if (!IsAscii(s)) { 1883 return; 1884 } 1885 numAscii += s.Length(); 1886 1887 MOZ_ASSERT(!mStartedFeedingDevTools); 1888 // Report the ASCII prefix to dev tools if needed 1889 if (mURIToSendToDevtools) { 1890 buffer = mFirstBufferOfMetaScan; 1891 while (buffer != mFirstBuffer) { 1892 MOZ_ASSERT(buffer, "mFirstBuffer should have acted as sentinel!"); 1893 MOZ_ASSERT(buffer->getStart() == buffer->getEnd(), 1894 "Why wasn't an early buffer fully consumed?"); 1895 auto s = Span(buffer->getBuffer(), buffer->getStart()); 1896 OnNewContent(s); 1897 buffer = buffer->next; 1898 } 1899 auto s = Span(mFirstBuffer->getBuffer(), mFirstBuffer->getStart()); 1900 OnNewContent(s); 1901 } 1902 1903 // Success! Now let's get rid of the already-decoded but not tokenized data: 1904 mFirstBuffer->setEnd(mFirstBuffer->getStart()); 1905 mLastBuffer = mFirstBuffer; 1906 mFirstBuffer->next = nullptr; 1907 1908 // Note: We could have scanned further for ASCII, which could avoid some 1909 // buffer deallocation and reallocation. However, chances are that if we got 1910 // until meta without non-ASCII before, there's going to be a title with 1911 // non-ASCII soon after anyway, so let's avoid the complexity of finding out. 1912 1913 MOZ_ASSERT(mUnicodeDecoder, "How come we scanned meta without a decoder?"); 1914 mEncoding = aEncoding; 1915 mEncoding->NewDecoderWithoutBOMHandlingInto(*mUnicodeDecoder); 1916 mHasHadErrors = false; 1917 1918 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing, 1919 "Must have set mDecodingLocalFileWithoutTokenizing to false to " 1920 "report data to dev tools below"); 1921 MOZ_ASSERT(!mLookingForMetaCharset, 1922 "Must have set mLookingForMetaCharset to false to report data to " 1923 "dev tools below"); 1924 1925 // Now skip over as many bytes and redecode the tail of the 1926 // buffered bytes. 1927 size_t skipped = 0; 1928 for (auto&& buffer : mBufferedBytes) { 1929 size_t nextSkipped = skipped + buffer.Length(); 1930 if (nextSkipped <= numAscii) { 1931 skipped = nextSkipped; 1932 continue; 1933 } 1934 if (skipped >= numAscii) { 1935 WriteStreamBytes(buffer); 1936 skipped = nextSkipped; 1937 continue; 1938 } 1939 size_t tailLength = nextSkipped - numAscii; 1940 WriteStreamBytes(Span<uint8_t>(buffer).From(buffer.Length() - tailLength)); 1941 skipped = nextSkipped; 1942 } 1943 } 1944 1945 size_t nsHtml5StreamParser::CountGts() { 1946 if (!mGtBuffer) { 1947 return 0; 1948 } 1949 size_t gts = 0; 1950 nsHtml5OwningUTF16Buffer* buffer = mFirstBufferOfMetaScan; 1951 for (;;) { 1952 MOZ_ASSERT(buffer, "How did we walk past mGtBuffer?"); 1953 char16_t* buf = buffer->getBuffer(); 1954 if (buffer == mGtBuffer) { 1955 for (int32_t i = 0; i <= mGtPos; ++i) { 1956 if (buf[i] == u'>') { 1957 ++gts; 1958 } 1959 } 1960 break; 1961 } 1962 for (int32_t i = 0; i < buffer->getEnd(); ++i) { 1963 if (buf[i] == u'>') { 1964 ++gts; 1965 } 1966 } 1967 buffer = buffer->next; 1968 } 1969 return gts; 1970 } 1971 1972 void nsHtml5StreamParser::DiscardMetaSpeculation() { 1973 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex); 1974 // Rewind the stream 1975 MOZ_ASSERT(!mAtEOF, "How did we end up setting this?"); 1976 mTokenizer->resetToDataState(); 1977 mTokenizer->setLineNumber(1); 1978 mLastWasCR = false; 1979 1980 if (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN) { 1981 // resetToDataState() above logically rewinds to the state before 1982 // the plain text start, so we need to start plain text again to 1983 // put the tokenizer into the plain text state. 1984 mTokenizer->StartPlainText(); 1985 } 1986 1987 mFirstBuffer = mLastBuffer; 1988 mFirstBuffer->setStart(0); 1989 mFirstBuffer->setEnd(0); 1990 mFirstBuffer->next = nullptr; 1991 1992 mTreeBuilder->flushCharacters(); // empty the pending buffer 1993 mTreeBuilder->ClearOps(); // now get rid of the failed ops 1994 1995 if (mMode == VIEW_SOURCE_HTML) { 1996 mTokenizer->RewindViewSource(); 1997 } 1998 1999 { 2000 // We know that this resets the tree builder back to the start state. 2001 // This must happen _after_ the flushCharacters() call above! 2002 const auto& speculation = mSpeculations.ElementAt(0); 2003 mTreeBuilder->loadState(speculation->GetSnapshot()); 2004 } 2005 2006 // Experimentation suggests that we don't need to do anything special 2007 // for ignoring the leading LF in View Source here. 2008 2009 mSpeculations.Clear(); // potentially a huge number of destructors 2010 // run here synchronously... 2011 2012 // Now set up a new speculation for the main thread to find. 2013 // Note that we stay in the speculating state, because the main thread 2014 // knows how to come out of that state and this thread does not. 2015 2016 nsHtml5Speculation* speculation = new nsHtml5Speculation( 2017 mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(), 2018 mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot()); 2019 MOZ_ASSERT(!mFlushTimerArmed, "How did we end up arming the timer?"); 2020 if (mMode == VIEW_SOURCE_HTML) { 2021 mTokenizer->SetViewSourceOpSink(speculation); 2022 mTokenizer->StartViewSourceBodyContents(); 2023 } else { 2024 MOZ_ASSERT(mMode != VIEW_SOURCE_XML); 2025 mTreeBuilder->SetOpSink(speculation); 2026 } 2027 mSpeculations.AppendElement(speculation); // adopts the pointer 2028 MOZ_ASSERT(mSpeculating, "How did we end speculating?"); 2029 } 2030 2031 /* 2032 * The general idea is to match WebKit and Blink exactly for meta 2033 * scan except: 2034 * 2035 * 1. WebKit and Blink look for meta as if scripting was disabled 2036 * for `noscript` purposes. This implementation matches the 2037 * `noscript` treatment of the observable DOM building (in order 2038 * to be able to use the same tree builder run). 2039 * 2. WebKit and Blink look for meta as if the foreign content 2040 * feedback from the tree builder to the tokenizer didn't exist. 2041 * This implementation considers the foreign content rules in 2042 * order to be able to use the same tree builder run for meta 2043 * and the observable DOM building. Note that since <svg> and 2044 * <math> imply the end of head, this only matters for meta after 2045 * head but starting within the 1024-byte zone. 2046 * 2047 * Template is treated specially, because that WebKit/Blink behavior 2048 * is easy to emulate unlike the above two exceptions. In general, 2049 * the meta scan token handler in WebKit and Blink behaves as if there 2050 * was a scripting-disabled tree builder predating the introduction 2051 * of foreign content and template. 2052 * 2053 * Meta is honored if it _starts_ within the first 1024 kilobytes or, 2054 * if by the 1024-byte boundary head hasn't ended and a template 2055 * element hasn't started, a meta occurs before the first of the head 2056 * ending or a template element starting. 2057 * 2058 * If a meta isn't honored according to the above definition, and 2059 * we aren't dealing with plain text, the buffered bytes, which by 2060 * now have to contain `>` character unless we encountered EOF, are 2061 * scanned for syntax resembling an XML declaration. 2062 * 2063 * If neither a meta nor syntax resembling an XML declaration has 2064 * been honored and we aren't inheriting the encoding from a 2065 * same-origin parent or parsing for XHR, chardetng is used. 2066 * chardetng runs first for the part of the document that was searched 2067 * for meta and then at EOF. The part searched for meta is defined as 2068 * follows in order to avoid network buffer boundary-dependent 2069 * behavior: 2070 * 2071 * 1. At least the first 1024 bytes. (This is what happens for plain 2072 * text.) 2073 * 2. If the 1024-byte boundary is within a tag, comment, doctype, 2074 * or CDATA section, at least up to the end of that token or CDATA 2075 * section. (Exception: If the 1024-byte boundary is in an RCDATA 2076 * end tag that hasn't yet been decided to be an end tag, the 2077 * token is not considered.) 2078 * 3. If at the 1024-byte boundary, head hasn't ended and there hasn't 2079 * been a template tag, up to the end of the first template tag 2080 * or token ending the head, whichever comes first. 2081 * 4. Except if head is ended by a text token, only to the end of the 2082 * most recent tag, comment, or doctype token. (Because text is 2083 * coalesced, so it would be harder to correlate the text to the 2084 * bytes.) 2085 * 2086 * An encoding-related reload is still possible if chardetng's guess 2087 * at EOF differs from its initial guess. 2088 */ 2089 bool nsHtml5StreamParser::ProcessLookingForMetaCharset(bool aEof) { 2090 MOZ_ASSERT(mBomState == BOM_SNIFFING_OVER); 2091 MOZ_ASSERT(mMode != VIEW_SOURCE_XML); 2092 bool rewound = false; 2093 MOZ_ASSERT(mForceAutoDetection || 2094 mCharsetSource < kCharsetFromInitialAutoDetectionASCII || 2095 mCharsetSource == kCharsetFromParentFrame, 2096 "Why are we looking for meta charset if we've seen it?"); 2097 // NOTE! We may come here multiple times with 2098 // mNumBytesBuffered == UNCONDITIONAL_META_SCAN_BOUNDARY 2099 // if the tokenizer suspends multiple times after decoding has reached 2100 // mNumBytesBuffered == UNCONDITIONAL_META_SCAN_BOUNDARY. That's why 2101 // we need to also check whether the we are at the end of the last 2102 // decoded buffer. 2103 // Note that DoDataAvailableBuffer() ensures that the code here has 2104 // the opportunity to run at the exact UNCONDITIONAL_META_SCAN_BOUNDARY 2105 // even if there isn't a network buffer boundary there. 2106 bool atKilobyte = false; 2107 if ((mNumBytesBuffered == UNCONDITIONAL_META_SCAN_BOUNDARY && 2108 mFirstBuffer == mLastBuffer && !mFirstBuffer->hasMore())) { 2109 atKilobyte = true; 2110 mTokenizer->AtKilobyteBoundary(); 2111 } 2112 if (!mNeedsEncodingSwitchTo && 2113 (aEof || (mTemplatePushedOrHeadPopped && 2114 !mTokenizer->IsInTokenStartedAtKilobyteBoundary() && 2115 (atKilobyte || 2116 mNumBytesBuffered > UNCONDITIONAL_META_SCAN_BOUNDARY)))) { 2117 // meta charset was not found 2118 mLookingForMetaCharset = false; 2119 if (mStartsWithLtQuestion && mCharsetSource < kCharsetFromXmlDeclaration) { 2120 // Look for bogo XML declaration. 2121 // Search the first buffer in the hope that '>' is within it. 2122 MOZ_ASSERT(!mBufferedBytes.IsEmpty(), 2123 "How did at least <? not get buffered?"); 2124 Buffer<uint8_t>& first = mBufferedBytes[0]; 2125 const Encoding* encoding = 2126 xmldecl_parse(first.Elements(), first.Length()); 2127 if (!encoding) { 2128 // Our bogo XML declaration scanner wants to see a contiguous buffer, so 2129 // let's linearize the data. (Ideally, the XML declaration scanner would 2130 // be incremental, but this is the rare path anyway.) 2131 Vector<uint8_t> contiguous; 2132 if (!contiguous.append(first.Elements(), first.Length())) { 2133 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2134 return false; 2135 } 2136 for (size_t i = 1; i < mBufferedBytes.Length(); ++i) { 2137 Buffer<uint8_t>& buffer = mBufferedBytes[i]; 2138 const uint8_t* elements = buffer.Elements(); 2139 size_t length = buffer.Length(); 2140 const uint8_t* lt = (const uint8_t*)memchr(elements, '>', length); 2141 bool stop = false; 2142 if (lt) { 2143 length = (lt - elements) + 1; 2144 stop = true; 2145 } 2146 if (!contiguous.append(elements, length)) { 2147 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2148 return false; 2149 } 2150 if (stop) { 2151 // Avoid linearizing all buffered bytes unnecessarily. 2152 break; 2153 } 2154 } 2155 encoding = xmldecl_parse(contiguous.begin(), contiguous.length()); 2156 } 2157 if (encoding) { 2158 if (!(mForceAutoDetection && (encoding->IsAsciiCompatible() || 2159 encoding == ISO_2022_JP_ENCODING))) { 2160 mForceAutoDetection = false; 2161 mNeedsEncodingSwitchTo = encoding; 2162 mEncodingSwitchSource = kCharsetFromXmlDeclaration; 2163 } 2164 } 2165 } 2166 // Check again in case we found an encoding in the bogo XML declaration. 2167 if (!mNeedsEncodingSwitchTo && 2168 (mForceAutoDetection || 2169 mCharsetSource < kCharsetFromInitialAutoDetectionASCII) && 2170 !(mMode == LOAD_AS_DATA || mMode == VIEW_SOURCE_XML) && 2171 !(mDecodingLocalFileWithoutTokenizing && !aEof && 2172 mNumBytesBuffered <= LOCAL_FILE_UTF_8_BUFFER_SIZE)) { 2173 MOZ_ASSERT(!mStartedFeedingDetector); 2174 if (mNumBytesBuffered == UNCONDITIONAL_META_SCAN_BOUNDARY || aEof) { 2175 // We know that all the buffered bytes have been tokenized, so feed 2176 // them all to chardetng. 2177 for (auto&& buffer : mBufferedBytes) { 2178 FeedDetector(buffer); 2179 } 2180 if (aEof) { 2181 MOZ_ASSERT(!mChardetEof); 2182 DetectorEof(); 2183 } 2184 auto [encoding, source] = GuessEncoding(true); 2185 mNeedsEncodingSwitchTo = encoding; 2186 mEncodingSwitchSource = source; 2187 } else if (mNumBytesBuffered > UNCONDITIONAL_META_SCAN_BOUNDARY) { 2188 size_t gtsLeftToFind = CountGts(); 2189 size_t bytesSeen = 0; 2190 // We sync the bytes to the UTF-16 code units seen to avoid depending 2191 // on network buffer boundaries. We do the syncing by counting '>' 2192 // bytes / code units. However, we always scan at least 1024 bytes. 2193 // The 1024-byte boundary is guaranteed to be between buffers. 2194 // The guarantee is implemented in DoDataAvailableBuffer(). 2195 for (auto&& buffer : mBufferedBytes) { 2196 if (!mNeedsEncodingSwitchTo) { 2197 if (gtsLeftToFind) { 2198 auto span = buffer.AsSpan(); 2199 bool feed = true; 2200 for (size_t i = 0; i < span.Length(); ++i) { 2201 if (span[i] == uint8_t('>')) { 2202 --gtsLeftToFind; 2203 if (!gtsLeftToFind) { 2204 if (bytesSeen < UNCONDITIONAL_META_SCAN_BOUNDARY) { 2205 break; 2206 } 2207 ++i; // Skip the gt 2208 FeedDetector(span.To(i)); 2209 auto [encoding, source] = GuessEncoding(true); 2210 mNeedsEncodingSwitchTo = encoding; 2211 mEncodingSwitchSource = source; 2212 FeedDetector(span.From(i)); 2213 bytesSeen += buffer.Length(); 2214 // No need to update bytesSeen anymore, but let's do it for 2215 // debugging. 2216 // We should do `continue outer;` but C++ can't. 2217 feed = false; 2218 break; 2219 } 2220 } 2221 } 2222 if (feed) { 2223 FeedDetector(buffer); 2224 bytesSeen += buffer.Length(); 2225 } 2226 continue; 2227 } 2228 if (bytesSeen == UNCONDITIONAL_META_SCAN_BOUNDARY) { 2229 auto [encoding, source] = GuessEncoding(true); 2230 mNeedsEncodingSwitchTo = encoding; 2231 mEncodingSwitchSource = source; 2232 } 2233 } 2234 FeedDetector(buffer); 2235 bytesSeen += buffer.Length(); 2236 } 2237 } 2238 MOZ_ASSERT(mNeedsEncodingSwitchTo, 2239 "How come we didn't call GuessEncoding()?"); 2240 } 2241 } 2242 if (mNeedsEncodingSwitchTo) { 2243 mDecodingLocalFileWithoutTokenizing = false; 2244 mLookingForMetaCharset = false; 2245 2246 auto needsEncodingSwitchTo = WrapNotNull(mNeedsEncodingSwitchTo); 2247 mNeedsEncodingSwitchTo = nullptr; 2248 2249 SwitchDecoderIfAsciiSoFar(needsEncodingSwitchTo); 2250 // The above line may have changed mEncoding so that mEncoding equals 2251 // needsEncodingSwitchTo. 2252 2253 mCharsetSource = mEncodingSwitchSource; 2254 2255 if (mMode == VIEW_SOURCE_HTML) { 2256 auto r = mTokenizer->FlushViewSource(); 2257 if (r.isErr()) { 2258 MarkAsBroken(r.unwrapErr()); 2259 return false; 2260 } 2261 } 2262 auto r = mTreeBuilder->Flush(); 2263 if (r.isErr()) { 2264 MarkAsBroken(r.unwrapErr()); 2265 return false; 2266 } 2267 2268 if (mEncoding != needsEncodingSwitchTo) { 2269 // Speculation failed 2270 rewound = true; 2271 2272 if (mEncoding == ISO_2022_JP_ENCODING || 2273 needsEncodingSwitchTo == ISO_2022_JP_ENCODING) { 2274 // Chances are no Web author will fix anything due to this message, so 2275 // this is here to help understanding issues when debugging sites made 2276 // by someone else. 2277 mTreeBuilder->MaybeComplainAboutCharset("EncSpeculationFail2022", false, 2278 mTokenizer->getLineNumber()); 2279 } else { 2280 if (mCharsetSource == kCharsetFromMetaTag) { 2281 mTreeBuilder->MaybeComplainAboutCharset( 2282 "EncSpeculationFailMeta", false, mTokenizer->getLineNumber()); 2283 } else if (mCharsetSource == kCharsetFromXmlDeclaration) { 2284 // This intentionally refers to the line number of how far ahead 2285 // the document was parsed even though the bogo XML decl is always 2286 // on line 1. 2287 mTreeBuilder->MaybeComplainAboutCharset( 2288 "EncSpeculationFailXml", false, mTokenizer->getLineNumber()); 2289 } 2290 } 2291 2292 DiscardMetaSpeculation(); 2293 // Redecode the stream. 2294 mEncoding = needsEncodingSwitchTo; 2295 mUnicodeDecoder = mEncoding->NewDecoderWithBOMRemoval(); 2296 mHasHadErrors = false; 2297 2298 MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing, 2299 "Must have set mDecodingLocalFileWithoutTokenizing to false " 2300 "to report data to dev tools below"); 2301 MOZ_ASSERT(!mLookingForMetaCharset, 2302 "Must have set mLookingForMetaCharset to false to report data " 2303 "to dev tools below"); 2304 for (auto&& buffer : mBufferedBytes) { 2305 nsresult rv = WriteStreamBytes(buffer); 2306 if (NS_FAILED(rv)) { 2307 MarkAsBroken(rv); 2308 return false; 2309 } 2310 } 2311 } 2312 } else if (!mLookingForMetaCharset && !mDecodingLocalFileWithoutTokenizing) { 2313 MOZ_ASSERT(!mStartedFeedingDevTools); 2314 // Report all already-decoded buffers to the dev tools if needed. 2315 if (mURIToSendToDevtools) { 2316 nsHtml5OwningUTF16Buffer* buffer = mFirstBufferOfMetaScan; 2317 while (buffer) { 2318 auto s = Span(buffer->getBuffer(), buffer->getEnd()); 2319 OnNewContent(s); 2320 buffer = buffer->next; 2321 } 2322 } 2323 } 2324 if (!mLookingForMetaCharset) { 2325 mGtBuffer = nullptr; 2326 mGtPos = 0; 2327 2328 if (!mDecodingLocalFileWithoutTokenizing) { 2329 mFirstBufferOfMetaScan = nullptr; 2330 mBufferingBytes = false; 2331 mBufferedBytes.Clear(); 2332 mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource, true); 2333 if (mMode == VIEW_SOURCE_HTML) { 2334 auto r = mTokenizer->FlushViewSource(); 2335 if (r.isErr()) { 2336 MarkAsBroken(r.unwrapErr()); 2337 return false; 2338 } 2339 } 2340 auto r = mTreeBuilder->Flush(); 2341 if (r.isErr()) { 2342 MarkAsBroken(r.unwrapErr()); 2343 return false; 2344 } 2345 } 2346 } 2347 return rewound; 2348 } 2349 2350 void nsHtml5StreamParser::ParseAvailableData() { 2351 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 2352 mTokenizerMutex.AssertCurrentThreadOwns(); 2353 MOZ_ASSERT(!(mDecodingLocalFileWithoutTokenizing && !mLookingForMetaCharset)); 2354 2355 if (IsTerminatedOrInterrupted()) { 2356 return; 2357 } 2358 2359 if (mSpeculating && !IsSpeculationEnabled()) { 2360 return; 2361 } 2362 2363 bool requestedReload = false; 2364 for (;;) { 2365 if (!mFirstBuffer->hasMore()) { 2366 if (mFirstBuffer == mLastBuffer) { 2367 switch (mStreamState) { 2368 case STREAM_BEING_READ: 2369 // never release the last buffer. 2370 if (!mSpeculating) { 2371 // reuse buffer space if not speculating 2372 mFirstBuffer->setStart(0); 2373 mFirstBuffer->setEnd(0); 2374 } 2375 return; // no more data for now but expecting more 2376 case STREAM_ENDED: 2377 if (mAtEOF) { 2378 return; 2379 } 2380 if (mLookingForMetaCharset) { 2381 // When called with aEof=true, ProcessLookingForMetaCharset() 2382 // is guaranteed to set mLookingForMetaCharset to false so 2383 // that we can't come here twice. 2384 if (ProcessLookingForMetaCharset(true)) { 2385 if (IsTerminatedOrInterrupted()) { 2386 return; 2387 } 2388 continue; 2389 } 2390 } else if ((mForceAutoDetection || 2391 mCharsetSource < kCharsetFromParentFrame) && 2392 !(mMode == LOAD_AS_DATA || mMode == VIEW_SOURCE_XML) && 2393 !mReparseForbidden) { 2394 // An earlier DetectorEof() call is possible in which case 2395 // the one here is a no-op. 2396 DetectorEof(); 2397 auto [encoding, source] = GuessEncoding(false); 2398 if (encoding != mEncoding) { 2399 // Request a reload from the docshell. 2400 MOZ_ASSERT( 2401 (source >= 2402 kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII && 2403 source <= 2404 kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII) || 2405 source == kCharsetFromFinalUserForcedAutoDetection); 2406 mTreeBuilder->NeedsCharsetSwitchTo(encoding, source, 0); 2407 requestedReload = true; 2408 } else if (mCharsetSource == 2409 kCharsetFromInitialAutoDetectionASCII && 2410 mDetectorHasSeenNonAscii) { 2411 mCharsetSource = source; 2412 mTreeBuilder->UpdateCharsetSource(mCharsetSource); 2413 } 2414 } 2415 2416 mAtEOF = true; 2417 if (!mForceAutoDetection && !requestedReload) { 2418 if (mCharsetSource == kCharsetFromParentFrame) { 2419 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationFrame", 2420 false, 0); 2421 } else if (mCharsetSource == kCharsetFromXmlDeclaration) { 2422 // We know the bogo XML decl is always on the first line. 2423 mTreeBuilder->MaybeComplainAboutCharset("EncXmlDecl", false, 1); 2424 } else if ( 2425 mCharsetSource >= 2426 kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8 && 2427 mCharsetSource <= 2428 kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) { 2429 if (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN) { 2430 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclPlain", 2431 true, 0); 2432 } else { 2433 mTreeBuilder->MaybeComplainAboutCharset("EncNoDecl", true, 0); 2434 } 2435 } 2436 2437 if (mHasHadErrors && mEncoding != REPLACEMENT_ENCODING) { 2438 if (mEncoding == UTF_8_ENCODING) { 2439 mTreeBuilder->TryToEnableEncodingMenu(); 2440 } 2441 if (mCharsetSource == kCharsetFromParentFrame) { 2442 if (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN) { 2443 mTreeBuilder->MaybeComplainAboutCharset( 2444 "EncErrorFramePlain", true, 0); 2445 } else { 2446 mTreeBuilder->MaybeComplainAboutCharset("EncErrorFrame", 2447 true, 0); 2448 } 2449 } else if ( 2450 mCharsetSource >= kCharsetFromXmlDeclaration && 2451 !(mCharsetSource >= 2452 kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII && 2453 mCharsetSource <= 2454 kCharsetFromFinalUserForcedAutoDetection)) { 2455 mTreeBuilder->MaybeComplainAboutCharset("EncError", true, 0); 2456 } 2457 } 2458 } 2459 if (NS_SUCCEEDED(mTreeBuilder->IsBroken())) { 2460 mTokenizer->eof(); 2461 nsresult rv; 2462 if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) { 2463 MarkAsBroken(rv); 2464 } else { 2465 mTreeBuilder->StreamEnded(); 2466 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) { 2467 if (!mTokenizer->EndViewSource()) { 2468 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2469 } 2470 } 2471 } 2472 } 2473 FlushTreeOpsAndDisarmTimer(); 2474 return; // no more data and not expecting more 2475 default: 2476 MOZ_ASSERT_UNREACHABLE("It should be impossible to reach this."); 2477 return; 2478 } 2479 } 2480 mFirstBuffer = mFirstBuffer->next; 2481 continue; 2482 } 2483 2484 // now we have a non-empty buffer 2485 mFirstBuffer->adjust(mLastWasCR); 2486 mLastWasCR = false; 2487 if (mFirstBuffer->hasMore()) { 2488 if (!mTokenizer->EnsureBufferSpace(mFirstBuffer->getLength())) { 2489 MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2490 return; 2491 } 2492 mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer); 2493 nsresult rv; 2494 if (NS_FAILED((rv = mTreeBuilder->IsBroken()))) { 2495 MarkAsBroken(rv); 2496 return; 2497 } 2498 if (mTreeBuilder->HasScriptThatMayDocumentWriteOrBlock()) { 2499 // `HasScriptThatMayDocumentWriteOrBlock()` cannot return true if the 2500 // tree builder is preventing script execution. 2501 MOZ_ASSERT(mMode == NORMAL); 2502 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex); 2503 nsHtml5Speculation* speculation = new nsHtml5Speculation( 2504 mFirstBuffer, mFirstBuffer->getStart(), mTokenizer->getLineNumber(), 2505 mTokenizer->getColumnNumber(), mTreeBuilder->newSnapshot()); 2506 mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(), 2507 speculation->GetStartLineNumber()); 2508 if (mLookingForMetaCharset) { 2509 if (mMode == VIEW_SOURCE_HTML) { 2510 auto r = mTokenizer->FlushViewSource(); 2511 if (r.isErr()) { 2512 MarkAsBroken(r.unwrapErr()); 2513 return; 2514 } 2515 } 2516 auto r = mTreeBuilder->Flush(); 2517 if (r.isErr()) { 2518 MarkAsBroken(r.unwrapErr()); 2519 return; 2520 } 2521 } else { 2522 FlushTreeOpsAndDisarmTimer(); 2523 } 2524 mTreeBuilder->SetOpSink(speculation); 2525 mSpeculations.AppendElement(speculation); // adopts the pointer 2526 mSpeculating = true; 2527 } 2528 if (IsTerminatedOrInterrupted()) { 2529 return; 2530 } 2531 } 2532 if (mLookingForMetaCharset) { 2533 (void)ProcessLookingForMetaCharset(false); 2534 } 2535 } 2536 } 2537 2538 class nsHtml5StreamParserContinuation : public Runnable { 2539 private: 2540 nsHtml5StreamParserPtr mStreamParser; 2541 2542 public: 2543 explicit nsHtml5StreamParserContinuation(nsHtml5StreamParser* aStreamParser) 2544 : Runnable("nsHtml5StreamParserContinuation"), 2545 mStreamParser(aStreamParser) {} 2546 NS_IMETHOD Run() override { 2547 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex); 2548 mStreamParser->Uninterrupt(); 2549 mStreamParser->ParseAvailableData(); 2550 return NS_OK; 2551 } 2552 }; 2553 2554 void nsHtml5StreamParser::ContinueAfterScriptsOrEncodingCommitment( 2555 nsHtml5Tokenizer* aTokenizer, nsHtml5TreeBuilder* aTreeBuilder, 2556 bool aLastWasCR) { 2557 // nullptr for aTokenizer means encoding commitment as opposed to the "after 2558 // scripts" case. 2559 2560 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!"); 2561 MOZ_ASSERT(mMode != VIEW_SOURCE_XML, 2562 "ContinueAfterScriptsOrEncodingCommitment called in XML view " 2563 "source mode!"); 2564 MOZ_ASSERT(!(aTokenizer && mMode == VIEW_SOURCE_HTML), 2565 "ContinueAfterScriptsOrEncodingCommitment called with non-null " 2566 "tokenizer in HTML view " 2567 "source mode."); 2568 if (NS_FAILED(mExecutor->IsBroken())) { 2569 return; 2570 } 2571 MOZ_ASSERT(!(aTokenizer && mMode != NORMAL), 2572 "We should only be executing scripts in the normal mode."); 2573 if (!aTokenizer && (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN || 2574 mMode == VIEW_SOURCE_HTML)) { 2575 // Take the ops that were generated from OnStartRequest for the synthetic 2576 // head section of the document for plain text and HTML View Source. 2577 // XML View Source never needs this kind of encoding commitment. 2578 // We need to take the ops here so that they end up in the queue before 2579 // the ops that we take from a speculation later in this method. 2580 if (!mExecutor->TakeOpsFromStage()) { 2581 mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2582 return; 2583 } 2584 } else { 2585 #ifdef DEBUG 2586 mExecutor->AssertStageEmpty(); 2587 #endif 2588 } 2589 bool speculationFailed = false; 2590 { 2591 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex); 2592 if (mSpeculations.IsEmpty()) { 2593 MOZ_ASSERT_UNREACHABLE( 2594 "ContinueAfterScriptsOrEncodingCommitment called without " 2595 "speculations."); 2596 return; 2597 } 2598 2599 const auto& speculation = mSpeculations.ElementAt(0); 2600 if (aTokenizer && 2601 (aLastWasCR || !aTokenizer->isInDataState() || 2602 !aTreeBuilder->snapshotMatches(speculation->GetSnapshot()))) { 2603 speculationFailed = true; 2604 // We've got a failed speculation :-( 2605 MaybeDisableFutureSpeculation(); 2606 Interrupt(); // Make the parser thread release the tokenizer mutex sooner 2607 // Note that the interrupted state continues across possible intervening 2608 // Necko events until the nsHtml5StreamParserContinuation posted at the 2609 // end of this method runs. Therefore, this thread is guaranteed to 2610 // acquire mTokenizerMutex soon even if an intervening Necko event grabbed 2611 // it between now and the acquisition below. 2612 2613 // now fall out of the speculationAutoLock into the tokenizerAutoLock 2614 // block 2615 } else { 2616 // We've got a successful speculation! 2617 if (mSpeculations.Length() > 1) { 2618 // the first speculation isn't the current speculation, so there's 2619 // no need to bother the parser thread. 2620 if (!speculation->FlushToSink(mExecutor)) { 2621 mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2622 return; 2623 } 2624 MOZ_ASSERT(!mExecutor->IsScriptExecuting(), 2625 "ParseUntilBlocked() was supposed to ensure we don't come " 2626 "here when scripts are executing."); 2627 MOZ_ASSERT(!aTokenizer || mExecutor->IsInFlushLoop(), 2628 "How are we here if " 2629 "RunFlushLoop() didn't call ParseUntilBlocked() or we're " 2630 "not committing to an encoding?"); 2631 mSpeculations.RemoveElementAt(0); 2632 return; 2633 } 2634 // else 2635 Interrupt(); // Make the parser thread release the tokenizer mutex sooner 2636 // Note that the interrupted state continues across possible intervening 2637 // Necko events until the nsHtml5StreamParserContinuation posted at the 2638 // end of this method runs. Therefore, this thread is guaranteed to 2639 // acquire mTokenizerMutex soon even if an intervening Necko event grabbed 2640 // it between now and the acquisition below. 2641 2642 // now fall through 2643 // the first speculation is the current speculation. Need to 2644 // release the the speculation mutex and acquire the tokenizer 2645 // mutex. (Just acquiring the other mutex here would deadlock) 2646 } 2647 } 2648 { 2649 mozilla::MutexAutoLock tokenizerAutoLock(mTokenizerMutex); 2650 #ifdef DEBUG 2651 { 2652 mAtomTable.SetPermittedLookupEventTarget( 2653 GetMainThreadSerialEventTarget()); 2654 } 2655 #endif 2656 // In principle, the speculation mutex should be acquired here, 2657 // but there's no point, because the parser thread only acquires it 2658 // when it has also acquired the tokenizer mutex and we are already 2659 // holding the tokenizer mutex. 2660 if (speculationFailed) { 2661 MOZ_ASSERT(mMode == NORMAL); 2662 // Rewind the stream 2663 mAtEOF = false; 2664 const auto& speculation = mSpeculations.ElementAt(0); 2665 mFirstBuffer = speculation->GetBuffer(); 2666 mFirstBuffer->setStart(speculation->GetStart()); 2667 mTokenizer->setLineNumber(speculation->GetStartLineNumber()); 2668 mTokenizer->setColumnNumberAndResetNextLine( 2669 speculation->GetStartColumnNumber()); 2670 2671 nsContentUtils::ReportToConsole( 2672 nsIScriptError::warningFlag, "DOM Events"_ns, 2673 mExecutor->GetDocument(), nsContentUtils::eDOM_PROPERTIES, 2674 "SpeculationFailed2", nsTArray<nsString>(), 2675 SourceLocation(mExecutor->GetDocument()->GetDocumentURI(), 2676 speculation->GetStartLineNumber(), 2677 speculation->GetStartColumnNumber())); 2678 2679 nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer->next; 2680 while (buffer) { 2681 buffer->setStart(0); 2682 buffer = buffer->next; 2683 } 2684 2685 mSpeculations.Clear(); // potentially a huge number of destructors 2686 // run here synchronously on the main thread... 2687 2688 mTreeBuilder->flushCharacters(); // empty the pending buffer 2689 mTreeBuilder->ClearOps(); // now get rid of the failed ops 2690 2691 mTreeBuilder->SetOpSink(mExecutor->GetStage()); 2692 mExecutor->StartReadingFromStage(); 2693 mSpeculating = false; 2694 2695 // Copy state over 2696 mLastWasCR = aLastWasCR; 2697 mTokenizer->loadState(aTokenizer); 2698 mTreeBuilder->loadState(aTreeBuilder); 2699 } else { 2700 // We've got a successful speculation and at least a moment ago it was 2701 // the current speculation 2702 if (!mSpeculations.ElementAt(0)->FlushToSink(mExecutor)) { 2703 mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY); 2704 return; 2705 } 2706 MOZ_ASSERT(!mExecutor->IsScriptExecuting(), 2707 "ParseUntilBlocked() was supposed to ensure we don't come " 2708 "here when scripts are executing."); 2709 MOZ_ASSERT(!aTokenizer || mExecutor->IsInFlushLoop(), 2710 "How are we here if " 2711 "RunFlushLoop() didn't call ParseUntilBlocked() or we're not " 2712 "committing to an encoding?"); 2713 mSpeculations.RemoveElementAt(0); 2714 if (mSpeculations.IsEmpty()) { 2715 if (mMode == VIEW_SOURCE_HTML) { 2716 // If we looked for meta charset in the HTML View Source case. 2717 mTokenizer->SetViewSourceOpSink(mExecutor->GetStage()); 2718 } else { 2719 // yes, it was still the only speculation. Now stop speculating 2720 // However, before telling the executor to read from stage, flush 2721 // any pending ops straight to the executor, because otherwise 2722 // they remain unflushed until we get more data from the network. 2723 mTreeBuilder->SetOpSink(mExecutor); 2724 auto r = mTreeBuilder->Flush(true); 2725 if (r.isErr()) { 2726 mExecutor->MarkAsBroken(r.unwrapErr()); 2727 return; 2728 } 2729 mTreeBuilder->SetOpSink(mExecutor->GetStage()); 2730 } 2731 mExecutor->StartReadingFromStage(); 2732 mSpeculating = false; 2733 } 2734 } 2735 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this); 2736 if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) { 2737 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation"); 2738 } 2739 // A stream event might run before this event runs, but that's harmless. 2740 #ifdef DEBUG 2741 mAtomTable.SetPermittedLookupEventTarget(mEventTarget); 2742 #endif 2743 } 2744 } 2745 2746 void nsHtml5StreamParser::ContinueAfterFailedCharsetSwitch() { 2747 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!"); 2748 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this); 2749 if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) { 2750 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation"); 2751 } 2752 } 2753 2754 class nsHtml5TimerKungFu : public Runnable { 2755 private: 2756 nsHtml5StreamParserPtr mStreamParser; 2757 2758 public: 2759 explicit nsHtml5TimerKungFu(nsHtml5StreamParser* aStreamParser) 2760 : Runnable("nsHtml5TimerKungFu"), mStreamParser(aStreamParser) {} 2761 NS_IMETHOD Run() override { 2762 mozilla::MutexAutoLock flushTimerLock(mStreamParser->mFlushTimerMutex); 2763 if (mStreamParser->mFlushTimer) { 2764 mStreamParser->mFlushTimer->Cancel(); 2765 mStreamParser->mFlushTimer = nullptr; 2766 } 2767 return NS_OK; 2768 } 2769 }; 2770 2771 void nsHtml5StreamParser::DropTimer() { 2772 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!"); 2773 /* 2774 * Simply nulling out the timer wouldn't work, because if the timer is 2775 * armed, it needs to be canceled first. Simply canceling it first wouldn't 2776 * work, because nsTimerImpl::Cancel is not safe for calling from outside 2777 * the thread where nsTimerImpl::Fire would run. It's not safe to 2778 * dispatch a runnable to cancel the timer from the destructor of this 2779 * class, because the timer has a weak (void*) pointer back to this instance 2780 * of the stream parser and having the timer fire before the runnable 2781 * cancels it would make the timer access a deleted object. 2782 * 2783 * This DropTimer method addresses these issues. This method must be called 2784 * on the main thread before the destructor of this class is reached. 2785 * The nsHtml5TimerKungFu object has an nsHtml5StreamParserPtr that addrefs 2786 * this 2787 * stream parser object to keep it alive until the runnable is done. 2788 * The runnable cancels the timer on the parser thread, drops the timer 2789 * and lets nsHtml5StreamParserPtr send a runnable back to the main thread to 2790 * release the stream parser. 2791 */ 2792 mozilla::MutexAutoLock flushTimerLock(mFlushTimerMutex); 2793 if (mFlushTimer) { 2794 nsCOMPtr<nsIRunnable> event = new nsHtml5TimerKungFu(this); 2795 if (NS_FAILED(mEventTarget->Dispatch(event, nsIThread::DISPATCH_NORMAL))) { 2796 NS_WARNING("Failed to dispatch TimerKungFu event"); 2797 } 2798 } 2799 } 2800 2801 // Using a static, because the method name Notify is taken by the chardet 2802 // callback. 2803 void nsHtml5StreamParser::TimerCallback(nsITimer* aTimer, void* aClosure) { 2804 (static_cast<nsHtml5StreamParser*>(aClosure))->TimerFlush(); 2805 } 2806 2807 void nsHtml5StreamParser::TimerFlush() { 2808 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 2809 mozilla::MutexAutoLock autoLock(mTokenizerMutex); 2810 2811 MOZ_ASSERT(!mSpeculating, "Flush timer fired while speculating."); 2812 2813 // The timer fired if we got here. No need to cancel it. Mark it as 2814 // not armed, though. 2815 mFlushTimerArmed = false; 2816 2817 mFlushTimerEverFired = true; 2818 2819 if (IsTerminatedOrInterrupted()) { 2820 return; 2821 } 2822 2823 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) { 2824 auto r = mTreeBuilder->Flush(); // delete useless ops 2825 if (r.isErr()) { 2826 MarkAsBroken(r.unwrapErr()); 2827 return; 2828 } 2829 r = mTokenizer->FlushViewSource(); 2830 if (r.isErr()) { 2831 MarkAsBroken(r.unwrapErr()); 2832 return; 2833 } 2834 if (r.unwrap()) { 2835 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher); 2836 if (NS_FAILED(DispatchToMain(runnable.forget()))) { 2837 NS_WARNING("failed to dispatch executor flush event"); 2838 } 2839 } 2840 } else { 2841 // we aren't speculating and we don't know when new data is 2842 // going to arrive. Send data to the main thread. 2843 auto r = mTreeBuilder->Flush(true); 2844 if (r.isErr()) { 2845 MarkAsBroken(r.unwrapErr()); 2846 return; 2847 } 2848 if (r.unwrap()) { 2849 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher); 2850 if (NS_FAILED(DispatchToMain(runnable.forget()))) { 2851 NS_WARNING("failed to dispatch executor flush event"); 2852 } 2853 } 2854 } 2855 } 2856 2857 void nsHtml5StreamParser::MarkAsBroken(nsresult aRv) { 2858 MOZ_ASSERT(IsParserThread(), "Wrong thread!"); 2859 mTokenizerMutex.AssertCurrentThreadOwns(); 2860 2861 Terminate(); 2862 mTreeBuilder->MarkAsBroken(aRv); 2863 auto r = mTreeBuilder->Flush(false); 2864 if (r.isOk()) { 2865 MOZ_ASSERT(r.unwrap(), "Should have had the markAsBroken op!"); 2866 } else { 2867 MOZ_CRASH("OOM prevents propagation of OOM state"); 2868 } 2869 nsCOMPtr<nsIRunnable> runnable(mExecutorFlusher); 2870 if (NS_FAILED(DispatchToMain(runnable.forget()))) { 2871 NS_WARNING("failed to dispatch executor flush event"); 2872 } 2873 } 2874 2875 nsresult nsHtml5StreamParser::DispatchToMain( 2876 already_AddRefed<nsIRunnable>&& aRunnable) { 2877 return SchedulerGroup::Dispatch(std::move(aRunnable)); 2878 }