commit 3e28bf17eedf93851c338e8b024b98849e7468f4
parent 7091ef6896d27d04fc09c211d731ecfaead681c2
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Mon, 8 Dec 2025 15:34:00 +0000
Bug 1999038 - Add headless extraction to page extractor r=ai-ondevice-reviewers,tarek
Differential Revision: https://phabricator.services.mozilla.com/D271839
Diffstat:
6 files changed, 280 insertions(+), 27 deletions(-)
diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs
@@ -9,27 +9,23 @@
* @import { PageExtractorParent } from './PageExtractorParent.sys.mjs'
*/
-/* eslint-disable jsdoc/require-property-description */
-
/**
- * @typedef {object} Lazy
- * @property {typeof console} console
- * @property {typeof import("resource://gre/modules/Readerable.sys.mjs").isProbablyReaderable} isProbablyReaderable
- * @property {typeof import("moz-src:///toolkit/components/reader/ReaderMode.sys.mjs").ReaderMode} ReaderMode
- * @property {typeof import("./DOMExtractor.sys.mjs").extractTextFromDOM} extractTextFromDOM
+ * We wait for the page to be ready before extracting content headlessly. It's hard
+ * to know when a page is "ready", however the strategy here is to wait for
+ * DOMContentLoaded, and then a requestIdleCallback. This way the page has time
+ * to do an initial amount of work. However, if we wait too long, it will be felt by
+ * the user as lag. To mitigate this, wait for at most 2 seconds for the page to settle.
*/
+const MAX_REQUEST_IDLE_CALLBACK_DELAY_MS = 2000;
-/** @type {Lazy} */
-const lazy = /** @type {any} */ ({});
-
-ChromeUtils.defineLazyGetter(lazy, "console", () => {
- return console.createInstance({
- prefix: "PageExtractorChild",
- maxLogLevelPref: "browser.ml.logLevel",
- });
-});
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
-ChromeUtils.defineESModuleGetters(lazy, {
+const lazy = XPCOMUtils.declareLazy({
+ console: () =>
+ console.createInstance({
+ prefix: "PageExtractorChild",
+ maxLogLevelPref: "browser.ml.logLevel",
+ }),
ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
extractTextFromDOM:
"moz-src:///toolkit/components/pageextractor/DOMExtractor.sys.mjs",
@@ -61,11 +57,35 @@ export class PageExtractorChild extends JSWindowActorChild {
return this.getAboutReaderContent();
}
return this.getText(data);
+ case "PageExtractorParent:WaitForPageReady":
+ return this.waitForPageReady();
}
return Promise.reject(new Error("Unknown message: " + name));
}
/**
+ * This function resolves once the page is ready after a requestIdleCallback.
+ *
+ * @returns {Promise<void>}
+ */
+  async waitForPageReady() {
+    // Adapts the callback-based DOM APIs to a Promise. Nothing in here rejects,
+    // so callers only ever observe resolution.
+    return new Promise(resolve => {
+      const waitForIdle = () => {
+        // The timeout caps the idle wait so that a busy page cannot stall the
+        // extraction indefinitely.
+        this.document.ownerGlobal.requestIdleCallback(() => resolve(), {
+          timeout: MAX_REQUEST_IDLE_CALLBACK_DELAY_MS,
+        });
+      };
+
+      if (this.document.readyState == "loading") {
+        // DOMContentLoaded has not fired yet. `once` removes the listener after
+        // it runs so the closure over `resolve` is not kept on the document.
+        this.document.addEventListener("DOMContentLoaded", waitForIdle, {
+          once: true,
+        });
+      } else {
+        lazy.console.log("The page is already interactive");
+        waitForIdle();
+      }
+    });
+  }
+
+ /**
* @see PageExtractorParent#getReaderModeContent for docs
*
* @param {boolean} force
diff --git a/toolkit/components/pageextractor/PageExtractorParent.sys.mjs b/toolkit/components/pageextractor/PageExtractorParent.sys.mjs
@@ -5,10 +5,22 @@
// @ts-check
/**
+ * @import { HiddenFrame } from "resource://gre/modules/HiddenFrame.sys.mjs"
* @import { GetTextOptions } from './PageExtractor.d.ts'
* @import { PageExtractorChild } from './PageExtractorChild.sys.mjs'
*/
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+// Lazily-initialized dependencies for this module.
+const lazy = XPCOMUtils.declareLazy({
+  HiddenBrowserManager: "resource://gre/modules/HiddenFrame.sys.mjs",
+  console: () =>
+    console.createInstance({
+      // Use this module's own name so its log output is not attributed to
+      // PageExtractorChild, which uses the same log level pref.
+      prefix: "PageExtractorParent",
+      maxLogLevelPref: "browser.ml.logLevel",
+    }),
+});
+
/**
* Extract a variety of content from pages for use in a smart window.
*/
@@ -28,6 +40,16 @@ export class PageExtractorParent extends JSWindowActorParent {
}
/**
+ * Waits for the child to report the page ready (DOMContentLoaded, then idle).
+ *
+ * @see PageExtractorChild#waitForPageReady
+ * @returns {Promise<void>}
+ */
+ waitForPageReady() {
+ // The child actor handles this message by running its own waitForPageReady.
+ return this.sendQuery("PageExtractorParent:WaitForPageReady");
+ }
+
+ /**
* Gets the visible text from the page. This function is a bit smarter than just
* document.body.innerText. See GetTextOptions
*
@@ -51,4 +73,108 @@ export class PageExtractorParent extends JSWindowActorParent {
.originNoSuffix == "resource://pdf.js"
);
}
+
+  /**
+   * Get a Headless PageExtractor. It is available until the callback's returned
+   * Promise is resolved. Then the headless browser is cleaned up.
+   *
+   * The returned Promise rejects when `url` uses the about: or file: scheme, when
+   * the PageExtractor actor cannot be obtained for the loaded page, or when
+   * waiting for the page to become ready fails.
+   *
+   * NOTE(review): the actor is only handed to the callback after a top-level
+   * location change whose host and port match `url`. A load that ends up on a
+   * different host never satisfies that check, so the Promise appears to stay
+   * pending in that case - confirm whether a timeout is needed.
+   *
+   * @see PageExtractorChild#getText
+   *
+   * @template T - The value resolved in the callback.
+   *
+   * @param {string} url - The page to load in the hidden browser.
+   * @param {(actor: PageExtractorParent) => Promise<T>} callback - Receives the
+   *   actor once the page is ready; its result becomes the result of this call.
+   * @returns {Promise<T>}
+   */
+  static async getHeadlessExtractor(url, callback) {
+    // Reject unsupported schemes up front so that no hidden browser is created
+    // for a request that can never succeed.
+    if (url.startsWith("about:")) {
+      throw new Error("about: pages are not supported.");
+    }
+    if (url.startsWith("file:")) {
+      throw new Error("file: pages are not supported.");
+    }
+
+    // `host` includes the port, which is why it is compared to `hostPort` below.
+    const { host } = new URL(url);
+
+    // The hidden browser manager controls the lifetime of the hidden browser.
+    return lazy.HiddenBrowserManager.withHiddenBrowser(async browser => {
+      // Create a custom message manager group for this browser so that the PageExtractor
+      // actor can communicate with it. The actor is registered to use this custom
+      // message manager group.
+      browser.setAttribute("messagemanagergroup", "headless-browsers");
+
+      /** @type {PromiseWithResolvers<PageExtractorParent>} */
+      const actorResolver = Promise.withResolvers();
+
+      const locationChangeFlags = Ci.nsIWebProgress.NOTIFY_LOCATION;
+      const onLocationChange = {
+        QueryInterface: ChromeUtils.generateQI([
+          "nsIWebProgressListener",
+          "nsISupportsWeakReference",
+        ]),
+        /**
+         * @param {nsIWebProgress} webProgress
+         * @param {nsIRequest} _request
+         * @param {nsIURI} location
+         * @param {number} _flags
+         */
+        onLocationChange(webProgress, _request, location, _flags) {
+          if (!webProgress.isTopLevel) {
+            lazy.console.log(
+              "Headless browser had a non-top level location change."
+            );
+            return;
+          }
+          if (location.spec == "about:blank") {
+            // about:blank is loaded first before loading the actual page.
+            return;
+          }
+          if (location.hostPort != host) {
+            // Log the same value that was compared so the message is accurate
+            // when only the port differs.
+            lazy.console.log(
+              "A location change happened that wasn't the host.",
+              location.hostPort,
+              host
+            );
+            // This is probably overkill, but make sure this is not a spurious
+            // redirect.
+            return;
+          }
+          browser.removeProgressListener(onLocationChange, locationChangeFlags);
+
+          /** @type {any} - This is reported as an `Element`, but it's a <browser> */
+          const topBrowser = webProgress.browsingContext.topFrameElement;
+
+          try {
+            const actor =
+              topBrowser.browsingContext.currentWindowGlobal.getActor(
+                "PageExtractor"
+              );
+
+            // Forward a failed query to the caller. Without the rejection
+            // handler the caller would wait forever on a Promise that never
+            // settles.
+            actor.waitForPageReady().then(() => {
+              lazy.console.log("Headless PageExtractor is ready", url);
+              actorResolver.resolve(actor);
+            }, actorResolver.reject);
+          } catch (error) {
+            // TODO (Bug 2001385) - It would be nice to catch if this is the
+            // `about:neterror` page or other similar errors. This will also fail if you
+            // try to access something like `about:reader` with the same error.
+            actorResolver.reject(
+              new Error(
+                "PageExtractor could not run on that page or the page could not be found.",
+                { cause: error }
+              )
+            );
+          }
+        },
+      };
+
+      browser.addProgressListener(onLocationChange, locationChangeFlags);
+
+      lazy.console.log("Loading a headless PageExtractor", url);
+
+      browser.fixupAndLoadURIString(url, {
+        triggeringPrincipal:
+          Services.scriptSecurityManager.getSystemPrincipal(),
+      });
+
+      return callback(await actorResolver.promise);
+    });
+  }
}
diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml
@@ -15,3 +15,5 @@ support-files = [
skip-if = [
"os == 'mac' && os_version == '15.30' && arch == 'aarch64' && opt", # Bug 1996139
]
+
+["browser_headless_extractor.js"]
diff --git a/toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js
@@ -0,0 +1,105 @@
+/* Any copyright is dedicated to the Public Domain.
+ https://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * Tests basic headless content extraction. The page is loaded in the background and
+ * the content is extracted.
+ */
+add_task(async function test_headless_extraction() {
+ const { PageExtractorParent } = ChromeUtils.importESModule(
+ "resource://gre/actors/PageExtractorParent.sys.mjs"
+ );
+ // serveOnce (head.js) starts an HTTP server that serves this markup.
+ const { url, serverClosed } = serveOnce(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8" />
+ <title>Headless Document</title>
+ </head>
+ <body>
+ <div>This is a headless document</div>
+ </body>
+ </html>
+ `);
+
+ // The callback's resolved value becomes the result of getHeadlessExtractor.
+ const text = await PageExtractorParent.getHeadlessExtractor(
+ url,
+ async pageExtractor => pageExtractor.getText()
+ );
+
+ is(text, "This is a headless document", "The page's content is extracted");
+
+ // Presumably settles once the server from serveOnce has stopped - confirm in head.js.
+ await serverClosed;
+});
+
+/**
+ * Tests that the page content is still extracted when the server responds with a 404.
+ */
+add_task(async function test_headless_extraction_404() {
+ const { PageExtractorParent } = ChromeUtils.importESModule(
+ "resource://gre/actors/PageExtractorParent.sys.mjs"
+ );
+ // The second argument is the HTTP status code that serveOnce responds with.
+ const { url, serverClosed } = serveOnce(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8" />
+ <title>404 not found</title>
+ </head>
+ <body>
+ <div>404 page not found.</div>
+ </body>
+ </html>
+ `,
+ 404
+ );
+
+ const text = await PageExtractorParent.getHeadlessExtractor(
+ url,
+ async pageExtractor => pageExtractor.getText()
+ );
+
+ is(
+ text,
+ "404 page not found.",
+ "The page's content is extracted even if it's a 404"
+ );
+
+ // Presumably settles once the server from serveOnce has stopped - confirm in head.js.
+ await serverClosed;
+});
+
+/**
+ * about: pages are restricted, so headless extraction is expected to reject them.
+ */
+add_task(async function test_headless_extraction_about_blank() {
+  const { PageExtractorParent } = ChromeUtils.importESModule(
+    "resource://gre/actors/PageExtractorParent.sys.mjs"
+  );
+
+  // The callback does nothing; the request is expected to fail.
+  const noop = () => {};
+  const extraction = PageExtractorParent.getHeadlessExtractor(
+    "about:blank",
+    noop
+  );
+
+  await Assert.rejects(
+    extraction,
+    /about: pages are not supported/,
+    "PageExtractor fails on about: pages."
+  );
+});
+
+/**
+ * Test page extraction on a file URL.
+ */
+add_task(async function test_headless_extraction_file_url() {
+  const { PageExtractorParent } = ChromeUtils.importESModule(
+    "resource://gre/actors/PageExtractorParent.sys.mjs"
+  );
+
+  // file: URLs are rejected before the callback is ever invoked, so a no-op
+  // callback is sufficient here.
+  await Assert.rejects(
+    PageExtractorParent.getHeadlessExtractor(
+      "file:///NeverGonnaGiveYouUp.mp4",
+      () => {}
+    ),
+    /file: pages are not supported/,
+    "PageExtractor fails on file: URLs."
+  );
+});
diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js
@@ -4,6 +4,11 @@
const BLANK_PAGE =
"data:text/html;charset=utf-8,<!DOCTYPE html><title>Blank</title>Blank page";
+/** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */
+const { HttpServer } = ChromeUtils.importESModule(
+ "resource://testing-common/httpd.sys.mjs"
+);
+
/**
* Use a tagged template literal to create a page extraction actor test. This spins
* up an http server that serves the markup in a new tab. The page extractor can then
@@ -66,20 +71,18 @@ async function html(strings, ...values) {
* Start an HTTP server that serves page.html with the provided HTML.
*
* @param {string} html
+ * @param {number} statusCode
*/
-function serveOnce(html) {
- /** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */
- const { HttpServer } = ChromeUtils.importESModule(
- "resource://testing-common/httpd.sys.mjs"
- );
+function serveOnce(html, statusCode = 200) {
info("Create server");
const server = new HttpServer();
const { promise, resolve } = Promise.withResolvers();
- server.registerPathHandler("/page.html", (_request, response) => {
+ server.registerPathHandler("/page.html", (request, response) => {
info("Request received for: " + url);
response.setHeader("Content-Type", "text/html");
+ response.setStatusLine(request.httpVersion, statusCode);
response.write(html);
resolve(server.stop());
});
diff --git a/toolkit/modules/ActorManagerParent.sys.mjs b/toolkit/modules/ActorManagerParent.sys.mjs
@@ -472,9 +472,6 @@ let JSWINDOWACTORS = {
},
child: {
esModuleURI: "resource://gre/actors/PageExtractorChild.sys.mjs",
- events: {
- DOMContentLoaded: { createActor: false },
- },
},
matches: [
"http://*/*",
@@ -484,7 +481,7 @@ let JSWINDOWACTORS = {
"data:text/html,*",
"about:reader?*",
],
- messageManagerGroups: ["browsers"],
+ messageManagerGroups: ["browsers", "headless-browsers"],
},
PopupAndRedirectBlocking: {