tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 3e28bf17eedf93851c338e8b024b98849e7468f4
parent 7091ef6896d27d04fc09c211d731ecfaead681c2
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Mon,  8 Dec 2025 15:34:00 +0000

Bug 1999038 - Add headless extraction to page extractor r=ai-ondevice-reviewers,tarek

Differential Revision: https://phabricator.services.mozilla.com/D271839

Diffstat:
M toolkit/components/pageextractor/PageExtractorChild.sys.mjs | 54 +++++++++++++++++++++++++++++++++++++-----------------
M toolkit/components/pageextractor/PageExtractorParent.sys.mjs | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/pageextractor/tests/browser/browser.toml | 2 ++
A toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/pageextractor/tests/browser/head.js | 15 +++++++++------
M toolkit/modules/ActorManagerParent.sys.mjs | 5 +----
6 files changed, 280 insertions(+), 27 deletions(-)

diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs @@ -9,27 +9,23 @@ * @import { PageExtractorParent } from './PageExtractorParent.sys.mjs' */ -/* eslint-disable jsdoc/require-property-description */ - /** - * @typedef {object} Lazy - * @property {typeof console} console - * @property {typeof import("resource://gre/modules/Readerable.sys.mjs").isProbablyReaderable} isProbablyReaderable - * @property {typeof import("moz-src:///toolkit/components/reader/ReaderMode.sys.mjs").ReaderMode} ReaderMode - * @property {typeof import("./DOMExtractor.sys.mjs").extractTextFromDOM} extractTextFromDOM + * We wait for the page to be ready before extracting content headlessly. It's hard + * to know when a page is "ready", however the strategy here is to wait for + * DOMContentLoaded, and then a requestIdleCallback. This way the page has time + * to do an initial amount of work. However, if we wait too long, it will be felt by + * the user as lag. To mitigate this, wait for at most 2 seconds for the page to settle.
*/ +const MAX_REQUEST_IDLE_CALLBACK_DELAY_MS = 2000; -/** @type {Lazy} */ -const lazy = /** @type {any} */ ({}); - -ChromeUtils.defineLazyGetter(lazy, "console", () => { - return console.createInstance({ - prefix: "PageExtractorChild", - maxLogLevelPref: "browser.ml.logLevel", - }); -}); +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; -ChromeUtils.defineESModuleGetters(lazy, { +const lazy = XPCOMUtils.declareLazy({ + console: () => + console.createInstance({ + prefix: "PageExtractorChild", + maxLogLevelPref: "browser.ml.logLevel", + }), ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs", extractTextFromDOM: "moz-src:///toolkit/components/pageextractor/DOMExtractor.sys.mjs", @@ -61,11 +57,35 @@ export class PageExtractorChild extends JSWindowActorChild { return this.getAboutReaderContent(); } return this.getText(data); + case "PageExtractorParent:WaitForPageReady": + return this.waitForPageReady(); } return Promise.reject(new Error("Unknown message: " + name)); } /** + * This function resolves once the page is ready after a requestIdleCallback. 
+ * + * @returns {Promise<void>} + */ + async waitForPageReady() { + return new Promise(resolve => { + const waitForIdle = () => { + this.document.ownerGlobal.requestIdleCallback(() => resolve(), { + timeout: MAX_REQUEST_IDLE_CALLBACK_DELAY_MS, + }); + }; + + if (this.document.readyState == "loading") { + this.document.addEventListener("DOMContentLoaded", waitForIdle); + } else { + lazy.console.log("The page is already interactive"); + waitForIdle(); + } + }); + } + + /** * @see PageExtractorParent#getReaderModeContent for docs * * @param {boolean} force diff --git a/toolkit/components/pageextractor/PageExtractorParent.sys.mjs b/toolkit/components/pageextractor/PageExtractorParent.sys.mjs @@ -5,10 +5,22 @@ // @ts-check /** + * @import { HiddenFrame } from "resource://gre/modules/HiddenFrame.sys.mjs" * @import { GetTextOptions } from './PageExtractor.d.ts' * @import { PageExtractorChild } from './PageExtractorChild.sys.mjs' */ +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +const lazy = XPCOMUtils.declareLazy({ + HiddenBrowserManager: "resource://gre/modules/HiddenFrame.sys.mjs", + console: () => + console.createInstance({ + prefix: "PageExtractorChild", + maxLogLevelPref: "browser.ml.logLevel", + }), +}); + /** * Extract a variety of content from pages for use in a smart window. */ @@ -28,6 +40,16 @@ export class PageExtractorParent extends JSWindowActorParent { } /** + * Waits for DOMContentLoaded. + * + * @see PageExtractorChild#waitForPageReady + * @returns {Promise<void>} + */ + waitForPageReady() { + return this.sendQuery("PageExtractorParent:WaitForPageReady"); + } + + /** * Gets the visible text from the page. This function is a bit smarter than just * document.body.innerText. See GetTextOptions * @@ -51,4 +73,108 @@ export class PageExtractorParent extends JSWindowActorParent { .originNoSuffix == "resource://pdf.js" ); } + + /** + * Get a Headless PageExtractor. 
It is available until the callback's returned + * Promise is resolved. Then the headless browser is cleaned up. + * + * @see PageExtractorChild#getText + * + * @template T - The value resolved in the callback. + * + * @param {string} url + * @param {(actor: PageExtractorParent) => Promise<T>} callback + * @returns {Promise<T>} + */ + static async getHeadlessExtractor(url, callback) { + // The hidden browser manager controls the lifetime of the hidden browser. + return lazy.HiddenBrowserManager.withHiddenBrowser(async browser => { + const { host } = new URL(url); + // Create a custom message manager group for this browser so that the PageExtractor + // actor can communicate with it. The actor is registered to use this custom + // message manager group. + browser.setAttribute("messagemanagergroup", "headless-browsers"); + if (url.startsWith("about:")) { + throw new Error("about: pages are not supported."); + } + if (url.startsWith("file:")) { + throw new Error("file: pages are not supported."); + } + + /** @type {PromiseWithResolvers<PageExtractorParent>} */ + let actorResolver = Promise.withResolvers(); + + const locationChangeFlags = Ci.nsIWebProgress.NOTIFY_LOCATION; + const onLocationChange = { + QueryInterface: ChromeUtils.generateQI([ + "nsIWebProgressListener", + "nsISupportsWeakReference", + ]), + /** + * @param {nsIWebProgress} webProgress + * @param {nsIRequest} _request + * @param {nsIURI} location + * @param {number} _flags + */ + onLocationChange(webProgress, _request, location, _flags) { + if (!webProgress.isTopLevel) { + lazy.console.log( + "Headless browser had a non-top level location change." + ); + return; + } + if (location.spec == "about:blank") { + // about:blank is loaded first before loading the actual page. + return; + } + if (location.hostPort != host) { + lazy.console.log( + "A location change happened that wasn't the host.", + location.host, + host + ); + // This is probably overkill, but make sure this is not a spurious + // redirect. 
+ return; + } + browser.removeProgressListener(onLocationChange, locationChangeFlags); + + /** @type {any} - This is reported as an `Element`, but it's a <browser> */ + const topBrowser = webProgress.browsingContext.topFrameElement; + + try { + const actor = + topBrowser.browsingContext.currentWindowGlobal.getActor( + "PageExtractor" + ); + + actor.waitForPageReady().then(() => { + lazy.console.log("Headless PageExtractor is ready", url); + actorResolver.resolve(actor); + }); + } catch (error) { + // TODO (Bug 2001385) - It would be nice to catch if this is the + // `about:neterror` page or other similar errors. This will also fail if you + // try to access something like `about:reader` with the same error. + actorResolver.reject( + new Error( + "PageExtractor could not run on that page or the page could not be found." + ) + ); + } + }, + }; + + browser.addProgressListener(onLocationChange, locationChangeFlags); + + lazy.console.log("Loading a headless PageExtractor", url); + + browser.fixupAndLoadURIString(url, { + triggeringPrincipal: + Services.scriptSecurityManager.getSystemPrincipal(), + }); + + return callback(await actorResolver.promise); + }); + } } diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml @@ -15,3 +15,5 @@ support-files = [ skip-if = [ "os == 'mac' && os_version == '15.30' && arch == 'aarch64' && opt", # Bug 1996139 ] + +["browser_headless_extractor.js"] diff --git a/toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js @@ -0,0 +1,105 @@ +/* Any copyright is dedicated to the Public Domain. + https://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +/** + * Tests basic headless content extraction. The page is loaded in the background and + * the content is extracted. 
+ */ +add_task(async function test_headless_extraction() { + const { PageExtractorParent } = ChromeUtils.importESModule( + "resource://gre/actors/PageExtractorParent.sys.mjs" + ); + const { url, serverClosed } = serveOnce(` + <!DOCTYPE html> + <html> + <head> + <meta charset="utf-8" /> + <title>Headless Document</title> + </head> + <body> + <div>This is a headless document</div> + </body> + </html> + `); + + const text = await PageExtractorParent.getHeadlessExtractor( + url, + async pageExtractor => pageExtractor.getText() + ); + + is(text, "This is a headless document", "The page's content is extracted"); + + await serverClosed; +}); + +/** + * Test what happens on a 404 page. + */ +add_task(async function test_headless_extraction_404() { + const { PageExtractorParent } = ChromeUtils.importESModule( + "resource://gre/actors/PageExtractorParent.sys.mjs" + ); + const { url, serverClosed } = serveOnce( + ` + <!DOCTYPE html> + <html> + <head> + <meta charset="utf-8" /> + <title>404 not found</title> + </head> + <body> + <div>404 page not found.</div> + </body> + </html> + `, + 404 + ); + + const text = await PageExtractorParent.getHeadlessExtractor( + url, + async pageExtractor => pageExtractor.getText() + ); + + is( + text, + "404 page not found.", + "The page's content is extracted even if it's a 404" + ); + + await serverClosed; +}); + +/** + * Test page extraction on a restricted page. + */ +add_task(async function test_headless_extraction_about_blank() { + const { PageExtractorParent } = ChromeUtils.importESModule( + "resource://gre/actors/PageExtractorParent.sys.mjs" + ); + + await Assert.rejects( + PageExtractorParent.getHeadlessExtractor("about:blank", () => {}), + /about: pages are not supported/, + "PageExtractor fails on about: pages." + ); +}); + +/** + * Test page extraction on a file URL. 
+ */ +add_task(async function test_headless_extraction_about_blank() { + const { PageExtractorParent } = ChromeUtils.importESModule( + "resource://gre/actors/PageExtractorParent.sys.mjs" + ); + + await Assert.rejects( + PageExtractorParent.getHeadlessExtractor( + "file:///NeverGonnaGiveYouUp.mp4", + () => {} + ), + /file: pages are not supported/, + "PageExtractor fails on file: URLs." + ); +}); diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js @@ -4,6 +4,11 @@ const BLANK_PAGE = "data:text/html;charset=utf-8,<!DOCTYPE html><title>Blank</title>Blank page"; +/** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */ +const { HttpServer } = ChromeUtils.importESModule( + "resource://testing-common/httpd.sys.mjs" +); + /** * Use a tagged template literal to create a page extraction actor test. This spins * up an http server that serves the markup in a new tab. The page extractor can then @@ -66,20 +71,18 @@ async function html(strings, ...values) { * Start an HTTP server that serves page.html with the provided HTML. 
* * @param {string} html + * @param {number} statusCode */ -function serveOnce(html) { - /** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */ - const { HttpServer } = ChromeUtils.importESModule( - "resource://testing-common/httpd.sys.mjs" - ); +function serveOnce(html, statusCode = 200) { info("Create server"); const server = new HttpServer(); const { promise, resolve } = Promise.withResolvers(); - server.registerPathHandler("/page.html", (_request, response) => { + server.registerPathHandler("/page.html", (request, response) => { info("Request received for: " + url); response.setHeader("Content-Type", "text/html"); + response.setStatusLine(request.httpVersion, statusCode); response.write(html); resolve(server.stop()); }); diff --git a/toolkit/modules/ActorManagerParent.sys.mjs b/toolkit/modules/ActorManagerParent.sys.mjs @@ -472,9 +472,6 @@ let JSWINDOWACTORS = { }, child: { esModuleURI: "resource://gre/actors/PageExtractorChild.sys.mjs", - events: { - DOMContentLoaded: { createActor: false }, - }, }, matches: [ "http://*/*", @@ -484,7 +481,7 @@ let JSWINDOWACTORS = { "data:text/html,*", "about:reader?*", ], - messageManagerGroups: ["browsers"], + messageManagerGroups: ["browsers", "headless-browsers"], }, PopupAndRedirectBlocking: {