[ tor-browser ].git.dasho

commit 3e28bf17eedf93851c338e8b024b98849e7468f4
parent 7091ef6896d27d04fc09c211d731ecfaead681c2
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Mon,  8 Dec 2025 15:34:00 +0000

Bug 1999038 - Add headless extraction to page extractor r=ai-ondevice-reviewers,tarek

Differential Revision: https://phabricator.services.mozilla.com/D271839

Diffstat:
M toolkit/components/pageextractor/PageExtractorChild.sys.mjs  | 54 +++++++++++++++++++++++++++++++++++++-----------------
M toolkit/components/pageextractor/PageExtractorParent.sys.mjs  | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/pageextractor/tests/browser/browser.toml  | 2 ++
A toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js  | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/pageextractor/tests/browser/head.js  | 15 +++++++++------
M toolkit/modules/ActorManagerParent.sys.mjs  | 5 +----

6 files changed, 280 insertions(+), 27 deletions(-)
diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs
@@ -9,27 +9,23 @@
  * @import { PageExtractorParent } from './PageExtractorParent.sys.mjs'
  */
 
-/* eslint-disable jsdoc/require-property-description */
-
 /**
- * @typedef {object} Lazy
- * @property {typeof console} console
- * @property {typeof import("resource://gre/modules/Readerable.sys.mjs").isProbablyReaderable} isProbablyReaderable
- * @property {typeof import("moz-src:///toolkit/components/reader/ReaderMode.sys.mjs").ReaderMode} ReaderMode
- * @property {typeof import("./DOMExtractor.sys.mjs").extractTextFromDOM} extractTextFromDOM
+ * We wait for the page to be ready before extracting content headlessly. It's hard
+ * to know when a page is "ready", however the strategy here is to wait for
+ * DOMContentLoaded, and then a requestIdleCallback. This way the page has time
+ * to do an initial amount of work. However, if we wait too long, it will be felt by
+ * the user as lag. To mitigate this, wait for at most 2 seconds for the page to settle.
  */
+const MAX_REQUEST_IDLE_CALLBACK_DELAY_MS = 2000;
 
-/** @type {Lazy} */
-const lazy = /** @type {any} */ ({});
-
-ChromeUtils.defineLazyGetter(lazy, "console", () => {
-  return console.createInstance({
-    prefix: "PageExtractorChild",
-    maxLogLevelPref: "browser.ml.logLevel",
-  });
-});
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
 
-ChromeUtils.defineESModuleGetters(lazy, {
+const lazy = XPCOMUtils.declareLazy({
+  console: () =>
+    console.createInstance({
+      prefix: "PageExtractorChild",
+      maxLogLevelPref: "browser.ml.logLevel",
+    }),
   ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
   extractTextFromDOM:
     "moz-src:///toolkit/components/pageextractor/DOMExtractor.sys.mjs",
@@ -61,11 +57,35 @@ export class PageExtractorChild extends JSWindowActorChild {
           return this.getAboutReaderContent();
         }
         return this.getText(data);
+      case "PageExtractorParent:WaitForPageReady":
+        return this.waitForPageReady();
     }
     return Promise.reject(new Error("Unknown message: " + name));
   }
 
   /**
+   * This function resolves once the page is ready after a requestIdleCallback.
+   *
+   * @returns {Promise<void>}
+   */
+  async waitForPageReady() {
+    return new Promise(resolve => {
+      const waitForIdle = () => {
+        this.document.ownerGlobal.requestIdleCallback(() => resolve(), {
+          timeout: MAX_REQUEST_IDLE_CALLBACK_DELAY_MS,
+        });
+      };
+
+      if (this.document.readyState == "loading") {
+        this.document.addEventListener("DOMContentLoaded", waitForIdle);
+      } else {
+        lazy.console.log("The page is already interactive");
+        waitForIdle();
+      }
+    });
+  }
+
+  /**
    * @see PageExtractorParent#getReaderModeContent for docs
    *
    * @param {boolean} force
diff --git a/toolkit/components/pageextractor/PageExtractorParent.sys.mjs b/toolkit/components/pageextractor/PageExtractorParent.sys.mjs
@@ -5,10 +5,22 @@
 // @ts-check
 
 /**
+ * @import { HiddenFrame } from "resource://gre/modules/HiddenFrame.sys.mjs"
  * @import { GetTextOptions } from './PageExtractor.d.ts'
  * @import { PageExtractorChild } from './PageExtractorChild.sys.mjs'
  */
 
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = XPCOMUtils.declareLazy({
+  HiddenBrowserManager: "resource://gre/modules/HiddenFrame.sys.mjs",
+  console: () =>
+    console.createInstance({
+      prefix: "PageExtractorChild",
+      maxLogLevelPref: "browser.ml.logLevel",
+    }),
+});
+
 /**
  * Extract a variety of content from pages for use in a smart window.
  */
@@ -28,6 +40,16 @@ export class PageExtractorParent extends JSWindowActorParent {
   }
 
   /**
+   * Waits for DOMContentLoaded and then an idle callback in the page.
+   *
+   * @see PageExtractorChild#waitForPageReady
+   * @returns {Promise<void>}
+   */
+  waitForPageReady() {
+    return this.sendQuery("PageExtractorParent:WaitForPageReady");
+  }
+
+  /**
    * Gets the visible text from the page. This function is a bit smarter than just
    * document.body.innerText. See GetTextOptions
    *
@@ -51,4 +73,108 @@ export class PageExtractorParent extends JSWindowActorParent {
         .originNoSuffix == "resource://pdf.js"
     );
   }
+
+  /**
+   * Get a Headless PageExtractor. It is available until the callback's returned
+   * Promise is resolved. Then the headless browser is cleaned up.
+   *
+   * @see PageExtractorChild#getText
+   *
+   * @template T - The value resolved in the callback.
+   *
+   * @param {string} url
+   * @param {(actor: PageExtractorParent) => Promise<T>} callback
+   * @returns {Promise<T>}
+   */
+  static async getHeadlessExtractor(url, callback) {
+    // The hidden browser manager controls the lifetime of the hidden browser.
+    return lazy.HiddenBrowserManager.withHiddenBrowser(async browser => {
+      const { host } = new URL(url);
+      // Create a custom message manager group for this browser so that the PageExtractor
+      // actor can communicate with it. The actor is registered to use this custom
+      // message manager group.
+      browser.setAttribute("messagemanagergroup", "headless-browsers");
+      if (url.startsWith("about:")) {
+        throw new Error("about: pages are not supported.");
+      }
+      if (url.startsWith("file:")) {
+        throw new Error("file: pages are not supported.");
+      }
+
+      /** @type {PromiseWithResolvers<PageExtractorParent>} */
+      let actorResolver = Promise.withResolvers();
+
+      const locationChangeFlags = Ci.nsIWebProgress.NOTIFY_LOCATION;
+      const onLocationChange = {
+        QueryInterface: ChromeUtils.generateQI([
+          "nsIWebProgressListener",
+          "nsISupportsWeakReference",
+        ]),
+        /**
+         * @param {nsIWebProgress} webProgress
+         * @param {nsIRequest} _request
+         * @param {nsIURI} location
+         * @param {number} _flags
+         */
+        onLocationChange(webProgress, _request, location, _flags) {
+          if (!webProgress.isTopLevel) {
+            lazy.console.log(
+              "Headless browser had a non-top level location change."
+            );
+            return;
+          }
+          if (location.spec == "about:blank") {
+            // about:blank is loaded first before loading the actual page.
+            return;
+          }
+          if (location.hostPort != host) {
+            lazy.console.log(
+              "A location change happened that wasn't the host.",
+              location.host,
+              host
+            );
+            // This is probably overkill, but make sure this is not a spurious
+            // redirect.
+            return;
+          }
+          browser.removeProgressListener(onLocationChange, locationChangeFlags);
+
+          /** @type {any} - This is reported as an `Element`, but it's a <browser> */
+          const topBrowser = webProgress.browsingContext.topFrameElement;
+
+          try {
+            const actor =
+              topBrowser.browsingContext.currentWindowGlobal.getActor(
+                "PageExtractor"
+              );
+
+            actor.waitForPageReady().then(() => {
+              lazy.console.log("Headless PageExtractor is ready", url);
+              actorResolver.resolve(actor);
+            });
+          } catch (error) {
+            // TODO (Bug 2001385) - It would be nice to catch if this is the
+            // `about:neterror` page or other similar errors. This will also fail if you
+            // try to access something like `about:reader` with the same error.
+            actorResolver.reject(
+              new Error(
+                "PageExtractor could not run on that page or the page could not be found."
+              )
+            );
+          }
+        },
+      };
+
+      browser.addProgressListener(onLocationChange, locationChangeFlags);
+
+      lazy.console.log("Loading a headless PageExtractor", url);
+
+      browser.fixupAndLoadURIString(url, {
+        triggeringPrincipal:
+          Services.scriptSecurityManager.getSystemPrincipal(),
+      });
+
+      return callback(await actorResolver.promise);
+    });
+  }
 }
diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml
@@ -15,3 +15,5 @@ support-files = [
 skip-if = [
   "os == 'mac' && os_version == '15.30' && arch == 'aarch64' && opt", # Bug 1996139
 ]
+
+["browser_headless_extractor.js"]
diff --git a/toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js
@@ -0,0 +1,105 @@
+/* Any copyright is dedicated to the Public Domain.
+   https://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * Tests basic headless content extraction. The page is loaded in the background and
+ * the content is extracted.
+ */
+add_task(async function test_headless_extraction() {
+  const { PageExtractorParent } = ChromeUtils.importESModule(
+    "resource://gre/actors/PageExtractorParent.sys.mjs"
+  );
+  const { url, serverClosed } = serveOnce(`
+    <!DOCTYPE html>
+    <html>
+      <head>
+        <meta charset="utf-8" />
+        <title>Headless Document</title>
+      </head>
+      <body>
+        <div>This is a headless document</div>
+      </body>
+    </html>
+  `);
+
+  const text = await PageExtractorParent.getHeadlessExtractor(
+    url,
+    async pageExtractor => pageExtractor.getText()
+  );
+
+  is(text, "This is a headless document", "The page's content is extracted");
+
+  await serverClosed;
+});
+
+/**
+ * Test what happens on a 404 page.
+ */
+add_task(async function test_headless_extraction_404() {
+  const { PageExtractorParent } = ChromeUtils.importESModule(
+    "resource://gre/actors/PageExtractorParent.sys.mjs"
+  );
+  const { url, serverClosed } = serveOnce(
+    `
+      <!DOCTYPE html>
+      <html>
+        <head>
+          <meta charset="utf-8" />
+          <title>404 not found</title>
+        </head>
+        <body>
+          <div>404 page not found.</div>
+        </body>
+      </html>
+    `,
+    404
+  );
+
+  const text = await PageExtractorParent.getHeadlessExtractor(
+    url,
+    async pageExtractor => pageExtractor.getText()
+  );
+
+  is(
+    text,
+    "404 page not found.",
+    "The page's content is extracted even if it's a 404"
+  );
+
+  await serverClosed;
+});
+
+/**
+ * Test page extraction on a restricted page.
+ */
+add_task(async function test_headless_extraction_about_blank() {
+  const { PageExtractorParent } = ChromeUtils.importESModule(
+    "resource://gre/actors/PageExtractorParent.sys.mjs"
+  );
+
+  await Assert.rejects(
+    PageExtractorParent.getHeadlessExtractor("about:blank", () => {}),
+    /about: pages are not supported/,
+    "PageExtractor fails on about: pages."
+  );
+});
+
+/**
+ * Test page extraction on a file URL.
+ */
+add_task(async function test_headless_extraction_about_blank() {
+  const { PageExtractorParent } = ChromeUtils.importESModule(
+    "resource://gre/actors/PageExtractorParent.sys.mjs"
+  );
+
+  await Assert.rejects(
+    PageExtractorParent.getHeadlessExtractor(
+      "file:///NeverGonnaGiveYouUp.mp4",
+      () => {}
+    ),
+    /file: pages are not supported/,
+    "PageExtractor fails on file: URLs."
+  );
+});
diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js
@@ -4,6 +4,11 @@
 const BLANK_PAGE =
   "data:text/html;charset=utf-8,<!DOCTYPE html><title>Blank</title>Blank page";
 
+/** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */
+const { HttpServer } = ChromeUtils.importESModule(
+  "resource://testing-common/httpd.sys.mjs"
+);
+
 /**
  * Use a tagged template literal to create a page extraction actor test. This spins
  * up an http server that serves the markup in a new tab. The page extractor can then
@@ -66,20 +71,18 @@ async function html(strings, ...values) {
  * Start an HTTP server that serves page.html with the provided HTML.
  *
  * @param {string} html
+ * @param {number} statusCode
  */
-function serveOnce(html) {
-  /** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */
-  const { HttpServer } = ChromeUtils.importESModule(
-    "resource://testing-common/httpd.sys.mjs"
-  );
+function serveOnce(html, statusCode = 200) {
   info("Create server");
   const server = new HttpServer();
 
   const { promise, resolve } = Promise.withResolvers();
 
-  server.registerPathHandler("/page.html", (_request, response) => {
+  server.registerPathHandler("/page.html", (request, response) => {
     info("Request received for: " + url);
     response.setHeader("Content-Type", "text/html");
+    response.setStatusLine(request.httpVersion, statusCode);
     response.write(html);
     resolve(server.stop());
   });
diff --git a/toolkit/modules/ActorManagerParent.sys.mjs b/toolkit/modules/ActorManagerParent.sys.mjs
@@ -472,9 +472,6 @@ let JSWINDOWACTORS = {
     },
     child: {
       esModuleURI: "resource://gre/actors/PageExtractorChild.sys.mjs",
-      events: {
-        DOMContentLoaded: { createActor: false },
-      },
     },
     matches: [
       "http://*/*",
@@ -484,7 +481,7 @@ let JSWINDOWACTORS = {
       "data:text/html,*",
       "about:reader?*",
     ],
-    messageManagerGroups: ["browsers"],
+    messageManagerGroups: ["browsers", "headless-browsers"],
   },
 
   PopupAndRedirectBlocking: {

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	toolkit/components/pageextractor/PageExtractorChild.sys.mjs	\|	54	+++++++++++++++++++++++++++++++++++++-----------------
M	toolkit/components/pageextractor/PageExtractorParent.sys.mjs	\|	126	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	toolkit/components/pageextractor/tests/browser/browser.toml	\|	2	++
A	toolkit/components/pageextractor/tests/browser/browser_headless_extractor.js	\|	105	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	toolkit/components/pageextractor/tests/browser/head.js	\|	15	+++++++++------
M	toolkit/modules/ActorManagerParent.sys.mjs	\|	5	+----