[ tor-browser ].git.dasho

commit 66f995ee957d644ff39a28036d5fb56ab752df7e
parent c306bc6bee5aeac0c2b850abbeb3faf05f623ade
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Fri,  3 Oct 2025 18:47:54 +0000

Bug 1990614 - Stub out the PageExtractor actor and the DOMExtractor implementation; r=nordzilla

This actor will have various extraction techniques added to it. For now
I'm starting with Reader Mode text content and a custom DOMExtractor
that is based off of the translations DOM Walker and block splitting
behavior.

My plan is to iterate on this base to provide more features and refine
the algorithm. This will make it easy to do a test-driven approach since
once the extraction is integrated, it can be a bit opaque on what the
behavior is on different pages.

Differential Revision: https://phabricator.services.mozilla.com/D266071

Diffstat:
M toolkit/components/moz.build  | 1 +
A toolkit/components/pageextractor/DOMExtractor.sys.mjs  | 500 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A toolkit/components/pageextractor/PageExtractor.d.ts  | 10 ++++++++++
A toolkit/components/pageextractor/PageExtractorChild.sys.mjs  | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A toolkit/components/pageextractor/PageExtractorParent.sys.mjs  | 42 ++++++++++++++++++++++++++++++++++++++++++
A toolkit/components/pageextractor/moz.build  | 17 +++++++++++++++++
A toolkit/components/pageextractor/tests/browser/browser.toml  | 9 +++++++++
A toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js  | 37 +++++++++++++++++++++++++++++++++++++
A toolkit/components/pageextractor/tests/browser/head.js  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/modules/ActorManagerParent.sys.mjs  | 20 ++++++++++++++++++++

10 files changed, 839 insertions(+), 0 deletions(-)
diff --git a/toolkit/components/moz.build b/toolkit/components/moz.build
@@ -57,6 +57,7 @@ DIRS += [
     "ml",
     "mozintl",
     "mozprotocol",
+    "pageextractor",
     "parentalcontrols",
     "passwordmgr",
     "pdfjs",
diff --git a/toolkit/components/pageextractor/DOMExtractor.sys.mjs b/toolkit/components/pageextractor/DOMExtractor.sys.mjs
@@ -0,0 +1,500 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
+
+// @ts-check
+
+/**
+ * @param {Document} document
+ * @returns {string}
+ */
+export function extractTextFromDOM(document) {
+  const blocks = subdivideNodeIntoBlocks(document.body);
+
+  let textContent = "";
+  for (const block of blocks) {
+    let innerText = "";
+    const element = asHTMLElement(block);
+    const text = asTextNode(block);
+
+    if (element) {
+      innerText = element.innerText.trim();
+    } else if (text?.nodeValue) {
+      innerText = text.nodeValue.trim();
+    }
+    if (innerText) {
+      textContent += "\n" + innerText;
+    }
+  }
+
+  return textContent;
+}
+
+/**
+ * Tags excluded from text extraction.
+ */
+const CONTENT_EXCLUDED_TAGS = new Set([
+  // TODO - We should add this and write some tests.
+  "CODE",
+
+  // The following are deprecated tags.
+  "DIR",
+  "APPLET",
+
+  // The following are embedded elements, and are not supported (yet).
+  "MATH",
+  "EMBED",
+  "OBJECT",
+  "IFRAME",
+
+  // This is an SVG tag that can contain arbitrary XML, ignore it.
+  "METADATA",
+
+  // These are elements that are treated as opaque by Firefox which causes their
+  // innerHTML property to be just the raw text node behind it. Any text that is sent as
+  // HTML must be valid, and there is no guarantee that the innerHTML is valid.
+  "NOSCRIPT",
+  "NOEMBED",
+  "NOFRAMES",
+
+  // Do not parse the HEAD tag.
+  "HEAD",
+
+  // These are not user-visible tags.
+  "STYLE",
+  "SCRIPT",
+  "TEMPLATE",
+]);
+
+const CONTENT_EXCLUDED_NODE_SELECTOR = [...CONTENT_EXCLUDED_TAGS].join(",");
+
+/**
+ * Get the ShadowRoot from the chrome-only openOrClosedShadowRoot API.
+ * This allows for extracting the content from WebComponents, which is not
+ * normally feasible in non-privileged contexts.
+ *
+ * @param {Node} node
+ *
+ * @returns {ShadowRoot | null}
+ */
+function getShadowRoot(node) {
+  return asElement(node)?.openOrClosedShadowRoot ?? null;
+}
+
+/**
+ * Determines if a node is ready for text extraction, or if it should be subdivided
+ * further. It doesn't check if the node has already been processed. This is done
+ * at the block level.
+ *
+ * @param {Node} node
+ * @returns {number} - NodeFilter acceptance status.
+ */
+function determineBlockStatus(node) {
+  if (!node) {
+    return NodeFilter.FILTER_REJECT;
+  }
+  if (getShadowRoot(node)) {
+    return NodeFilter.FILTER_ACCEPT;
+  }
+
+  if (isExcludedNode(node)) {
+    // This is an explicitly excluded node.
+    return NodeFilter.FILTER_REJECT;
+  }
+
+  if (
+    containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) &&
+    !hasNonWhitespaceTextNodes(node)
+  ) {
+    // Skip this node, and dig deeper into its tree to cut off smaller pieces to extract.
+    return NodeFilter.FILTER_SKIP;
+  }
+
+  if (nodeNeedsSubdividing(node)) {
+    // Skip this node, and dig deeper into its tree to cut off smaller pieces
+    // to extract. It is presumed to be a wrapper of block elements.
+    return NodeFilter.FILTER_SKIP;
+  }
+
+  // This textContent call is fairly expensive.
+  if (!node.textContent?.trim().length) {
+    // Do not use subtrees that are empty of text.
+    return !node.hasChildNodes()
+      ? NodeFilter.FILTER_REJECT
+      : NodeFilter.FILTER_SKIP;
+  }
+
+  // This node can be treated as entire block and is ready for text extraction.
+  return NodeFilter.FILTER_ACCEPT;
+}
+/**
+ * Determine if this element contains block-like content and needs further subdividing.
+ *
+ * @param {Node} node
+ * @returns {boolean}
+ */
+function nodeNeedsSubdividing(node) {
+  const element = asElement(node);
+  if (!element) {
+    // Only elements need to be further subdivided.
+    return false;
+  }
+
+  for (let childNode of element.childNodes) {
+    if (!childNode) {
+      continue;
+    }
+    switch (childNode.nodeType) {
+      case Node.TEXT_NODE: {
+        // Keep checking for more inline or text nodes.
+        continue;
+      }
+      case Node.ELEMENT_NODE: {
+        if (getIsBlockLike(childNode)) {
+          // This node is a block node, so it needs further subdividing.
+          return true;
+        } else if (nodeNeedsSubdividing(childNode)) {
+          // This non-block-like node may contain other block-like nodes.
+          return true;
+        }
+
+        // Keep checking for more inline or text nodes.
+        continue;
+      }
+      default: {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/**
+ * Returns true if an HTML element is hidden based on factors such as collapsed state and
+ * computed style, otherwise false.
+ *
+ * @param {HTMLElement} element
+ * @returns {boolean}
+ */
+function isHTMLElementHidden(element) {
+  // This is a cheap and easy check that will not compute style or force reflow.
+  if (element.hidden) {
+    // The element is explicitly hidden.
+    return true;
+  }
+
+  // Handle open/closed <details> elements. This will also not compute style or force reflow.
+  // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/details
+  if (
+    // The element is within a closed <details>
+    element.closest("details:not([open])") &&
+    // The element is not part of the <summary> of the <details>, which is always visible, even when closed.
+    !element.closest("summary")
+  ) {
+    // The element is within a closed <details> and is not part of the <summary>, therefore it is not visible.
+    return true;
+  }
+
+  // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible.
+  // https://github.com/jquery/jquery/blob/bd6b453b7effa78b292812dbe218491624994526/src/css/hiddenVisibleSelectors.js#L1-L10
+  if (
+    !(
+      element.offsetWidth ||
+      element.offsetHeight ||
+      element.getClientRects().length
+    )
+  ) {
+    return true;
+  }
+
+  const { ownerGlobal } = element;
+  if (!ownerGlobal) {
+    // We cannot compute the style without ownerGlobal, so we will assume it is not visible.
+    return true;
+  }
+
+  // This flushes the style, which is a performance cost.
+  const style = ownerGlobal.getComputedStyle(element);
+  if (!style) {
+    // We were unable to compute the style, so we will assume it is not visible.
+    return true;
+  }
+
+  // This is an issue with the DOM library generation.
+  // @ts-expect-error Property 'display' does not exist on type 'CSSStyleDeclaration'.ts(2339)
+  const { display, visibility, opacity } = style;
+
+  return (
+    display === "none" ||
+    visibility === "hidden" ||
+    visibility === "collapse" ||
+    opacity === "0"
+  );
+}
+
+/**
+ * @param {Node} node
+ */
+function isExcludedNode(node) {
+  // Property access can be expensive, so destructure required properties so they are
+  // not accessed multiple times.
+  const { nodeType } = node;
+
+  if (nodeType === Node.TEXT_NODE) {
+    // Text nodes are never excluded.
+    return false;
+  }
+  const element = asElement(node);
+  if (!element) {
+    // Only elements and text nodes should be considered.
+    return true;
+  }
+
+  const { nodeName } = element;
+
+  if (CONTENT_EXCLUDED_TAGS.has(nodeName.toUpperCase())) {
+    // SVG tags can be lowercased, so ensure everything is uppercased.
+    // This is an excluded tag.
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * Like `#isExcludedNode` but looks at the full subtree. Used to see whether
+ * we can consider a subtree, or whether we should split it into smaller
+ * branches first to try to exclude more of the content.
+ *
+ * @param {Node} node
+ * @param {string} excludedNodeSelector
+ *
+ * @returns {boolean}
+ */
+function containsExcludedNode(node, excludedNodeSelector) {
+  return Boolean(asElement(node)?.querySelector(excludedNodeSelector));
+}
+
+/**
+ * Test whether any of the direct child text nodes of the node are non-whitespace text nodes.
+ *
+ * For example:
+ *   - `<p>test</p>`: yes
+ *   - `<p> </p>`: no
+ *   - `<p><b>test</b></p>`: no
+ *
+ * @param {Node} node
+ *
+ * @returns {boolean}
+ */
+function hasNonWhitespaceTextNodes(node) {
+  if (node.nodeType !== Node.ELEMENT_NODE) {
+    // Only check element nodes.
+    return false;
+  }
+
+  for (const child of node.childNodes) {
+    const textNode = asTextNode(child);
+    if (textNode) {
+      if (!textNode.textContent?.trim()) {
+        // This is just whitespace.
+        continue;
+      }
+      // A text node with content was found.
+      return true;
+    }
+  }
+
+  // No non-whitespace text nodes were found.
+  return false;
+}
+
+/**
+ * Start walking down through a node's subtree and decide which nodes to extract content
+ * from. This first node is the root of the page.
+ *
+ * The nodes go through a process of subdivision until an appropriate sized chunk
+ * of inline text can be found.
+ *
+ * @param {Node} node
+ * @returns {Set<Node>}
+ */
+function subdivideNodeIntoBlocks(node) {
+  /** @type {Set<Node>} */
+  const blocks = new Set();
+  switch (determineBlockStatus(node)) {
+    case NodeFilter.FILTER_REJECT: {
+      // This node is rejected as it shouldn't be used for text extraction.
+      return blocks;
+    }
+
+    // Either a shadow host or a block element
+    case NodeFilter.FILTER_ACCEPT: {
+      const shadowRoot = getShadowRoot(node);
+      if (shadowRoot) {
+        processSubdivide(shadowRoot, blocks);
+      } else {
+        const element = asHTMLElement(node);
+        if (element && isHTMLElementHidden(element)) {
+          break;
+        }
+        if (noAncestorsAdded(node, blocks)) {
+          blocks.add(node);
+        }
+      }
+      break;
+    }
+
+    case NodeFilter.FILTER_SKIP: {
+      // This node may have text to extract, but it needs to be subdivided into smaller
+      // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes
+      // that contain enough inline elements to extract.
+      processSubdivide(node, blocks);
+      break;
+    }
+  }
+  return blocks;
+}
+
+/**
+ * Add qualified nodes to have their text content extracted by recursively walking
+ * through the DOM tree of nodes, including elements in the Shadow DOM.
+ *
+ * @param {Node} node
+ * @param {Set<Node>} blocks
+ */
+function processSubdivide(node, blocks) {
+  const { ownerDocument } = node;
+  if (!ownerDocument) {
+    return;
+  }
+
+  // This iterator will contain each node that has been subdivided enough to have its
+  // text extracted.
+  const nodeIterator = ownerDocument.createTreeWalker(
+    node,
+    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
+    determineBlockStatus
+  );
+
+  let currentNode;
+  while ((currentNode = nodeIterator.nextNode())) {
+    const shadowRoot = getShadowRoot(currentNode);
+    if (shadowRoot) {
+      processSubdivide(shadowRoot, blocks);
+    } else if (noAncestorsAdded(currentNode, blocks)) {
+      blocks.add(currentNode);
+    }
+  }
+}
+
+/**
+ * TODO - The original TranslationsDocument algorithm didn't require this, so perhaps
+ * something was not ported correctly. This should be removed to see if the error
+ * can be reproduced, and this mitigation removed.
+ *
+ * @param {Node} node
+ * @param {Set<Node>} blocks
+ */
+function noAncestorsAdded(node, blocks) {
+  for (const ancestor of getAncestorsIterator(node)) {
+    if (blocks.has(ancestor)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * Returns an iterator of a node's ancestors.
+ *
+ * @param {Node} node
+ *
+ * @yields {Node}
+ */
+function* getAncestorsIterator(node) {
+  const document = node.ownerDocument;
+  if (!document) {
+    return;
+  }
+  for (
+    let parent = node.parentNode;
+    parent && parent !== document.documentElement;
+    parent = parent.parentNode
+  ) {
+    yield parent;
+  }
+}
+
+/**
+ * Reads the element's computed style and determines if the element is a block-like
+ * element or not. Every element that lays out like a block should be used as a unit
+ * for text extraction.
+ *
+ * @param {Node} node
+ * @returns {boolean}
+ */
+function getIsBlockLike(node) {
+  const element = asElement(node);
+  if (!element) {
+    return false;
+  }
+
+  const { ownerGlobal } = element;
+  if (!ownerGlobal) {
+    return false;
+  }
+
+  if (element.namespaceURI === "http://www.w3.org/2000/svg") {
+    // SVG elements will report as inline, but there is no block layout in SVG.
+    // Treat every SVG element as being block so that every node will be subdivided.
+    return true;
+  }
+
+  /** @type {Record<string, string>} */
+  // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable.
+  const style = ownerGlobal.getComputedStyle(element) ?? { display: null };
+
+  return style.display !== "inline" && style.display !== "none";
+}
+
+/**
+ * Use TypeScript to determine if the Node is an Element.
+ *
+ * @param {Node | null | undefined} node
+ * @returns {Element | null}
+ */
+function asElement(node) {
+  if (node?.nodeType === Node.ELEMENT_NODE) {
+    return /** @type {HTMLElement} */ (node);
+  }
+  return null;
+}
+
+/**
+ * Use TypeScript to determine if the Node is a Text node.
+ *
+ * @param {Node | null} node
+ *
+ * @returns {Text | null}
+ */
+function asTextNode(node) {
+  if (node?.nodeType === Node.TEXT_NODE) {
+    return /** @type {Text} */ (node);
+  }
+  return null;
+}
+
+/**
+ * Use TypeScript to determine if the Node is an HTMLElement.
+ *
+ * @param {Node | null} node
+ *
+ * @returns {HTMLElement | null}
+ */
+function asHTMLElement(node) {
+  if (HTMLElement.isInstance(node)) {
+    return node;
+  }
+  return null;
+}
diff --git a/toolkit/components/pageextractor/PageExtractor.d.ts b/toolkit/components/pageextractor/PageExtractor.d.ts
@@ -0,0 +1,10 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+export interface GetTextOptions {
+  // Remove menus and other boilerplate.
+  removeBoilerplate: boolean;
+  // Just include the viewport content.
+  justViewport: boolean;
+}
diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs
@@ -0,0 +1,125 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// @ts-check
+
+/**
+ * @import { GetTextOptions } from './PageExtractor.js'
+ * @import { PageExtractorParent } from './PageExtractorParent.sys.mjs'
+ */
+
+/* eslint-disable jsdoc/require-property-description */
+
+/**
+ * @typedef {object} Lazy
+ * @property {typeof console} console
+ * @property {typeof import("resource://gre/modules/Readerable.sys.mjs").isProbablyReaderable} isProbablyReaderable
+ * @property {typeof import("moz-src:///toolkit/components/reader/ReaderMode.sys.mjs").ReaderMode} ReaderMode
+ * @property {typeof import("./DOMExtractor.sys.mjs").extractTextFromDOM} extractTextFromDOM
+ */
+
+/** @type {Lazy} */
+const lazy = /** @type {any} */ ({});
+
+ChromeUtils.defineLazyGetter(lazy, "console", () => {
+  return console.createInstance({
+    prefix: "PageExtractorChild",
+    maxLogLevelPref: "browser.ml.logLevel",
+  });
+});
+
+ChromeUtils.defineESModuleGetters(lazy, {
+  ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
+  extractTextFromDOM:
+    "moz-src:///toolkit/components/pageextractor/DOMExtractor.sys.mjs",
+  isProbablyReaderable: "resource://gre/modules/Readerable.sys.mjs",
+});
+
+/**
+ * Extract a variety of content from pages for use in a smart window.
+ */
+export class PageExtractorChild extends JSWindowActorChild {
+  /**
+   * Route the messages coming from the parent process.
+   *
+   * @param {object} message
+   * @param {string} message.name
+   * @param {any} message.data
+   *
+   * @returns {Promise<unknown>}
+   */
+  async receiveMessage({ name, data }) {
+    switch (name) {
+      case "PageExtractorParent:GetReaderModeContent":
+        return this.getReaderModeContent(data);
+      case "PageExtractorParent:GetText":
+        return this.getText(data);
+    }
+    return Promise.reject(new Error("Unknown message: " + name));
+  }
+
+  /**
+   * @see PageExtractorParent#getReaderModeContent for docs
+   *
+   * @param {boolean} force
+   * @returns {Promise<string | null>} text from the page
+   */
+  async getReaderModeContent(force) {
+    const window = this.browsingContext?.window;
+    const document = window?.document;
+
+    if (!force && (!document || !lazy.isProbablyReaderable(document))) {
+      return null;
+    }
+
+    if (!document) {
+      return "";
+    }
+
+    const article = await lazy.ReaderMode.parseDocument(document);
+    if (!article) {
+      return "";
+    }
+
+    const text = (article?.textContent || "")
+      .trim()
+      // Replace duplicate whitespace with either a single newline or space
+      .replace(/(\s*\n\s*)|\s{2,}/g, (_, newline) => (newline ? "\n" : " "));
+
+    lazy.console.log("GetReaderModeContent", { force });
+    lazy.console.debug(text);
+
+    return text;
+  }
+
+  /**
+   * @see PageExtractorParent#getText for docs
+   *
+   * @param {GetTextOptions} options
+   * @returns {string}
+   */
+  getText(options) {
+    const window = this.browsingContext?.window;
+    const document = window?.document;
+
+    if (!document) {
+      return "";
+    }
+
+    if (options.removeBoilerplate) {
+      throw new Error("Boilerplate removal is not supported yet.");
+    }
+
+    if (options.justViewport) {
+      throw new Error("Just getting the viewport is not supported yet.");
+    }
+
+    const text = lazy.extractTextFromDOM(document);
+
+    lazy.console.log("GetText", options);
+    lazy.console.debug(text);
+
+    return text.trim();
+  }
+}
diff --git a/toolkit/components/pageextractor/PageExtractorParent.sys.mjs b/toolkit/components/pageextractor/PageExtractorParent.sys.mjs
@@ -0,0 +1,42 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// @ts-check
+
+/**
+ * @import { GetTextOptions } from './PageExtractor.d.ts'
+ * @import { PageExtractorChild } from './PageExtractorChild.sys.mjs'
+ */
+
+/**
+ * Extract a variety of content from pages for use in a smart window.
+ */
+export class PageExtractorParent extends JSWindowActorParent {
+  /**
+   * Returns ReaderMode content when the page passes the `isProbablyReaderable` check.
+   * The check can be bypassed to force page content to be retrieved by setting `force`
+   * to true.
+   *
+   * @see PageExtractorChild#getReaderModeContent
+   *
+   * @param {boolean} force - Bypass the `isProbablyReaderable` check.
+   * @returns {Promise<string | null>}
+   */
+  getReaderModeContent(force = false) {
+    return this.sendQuery("PageExtractorParent:GetReaderModeContent", force);
+  }
+
+  /**
+   * Gets the visible text from the page. This function is a bit smarter than just
+   * document.body.innerText. See GetTextOptions
+   *
+   * @see PageExtractorChild#getText
+   *
+   * @param {Partial<GetTextOptions>} options
+   * @returns {Promise<string | null>}
+   */
+  getText(options = {}) {
+    return this.sendQuery("PageExtractorParent:GetText", options);
+  }
+}
diff --git a/toolkit/components/pageextractor/moz.build b/toolkit/components/pageextractor/moz.build
@@ -0,0 +1,17 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+BROWSER_CHROME_MANIFESTS += ["tests/browser/browser.toml"]
+
+FINAL_TARGET_FILES.actors += [
+    "PageExtractorChild.sys.mjs",
+    "PageExtractorParent.sys.mjs",
+]
+
+MOZ_SRC_FILES += [
+    "DOMExtractor.sys.mjs",
+]
+
+with Files("**"):
+    BUG_COMPONENT = ("Core", "Machine Learning")
diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml
@@ -0,0 +1,9 @@
+[DEFAULT]
+prefs = [
+  "browser.ml.logLevel=Info",
+]
+support-files = [
+  "head.js",
+]
+
+["browser_dom_extractor.js"]
diff --git a/toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js
@@ -0,0 +1,37 @@
+/* Any copyright is dedicated to the Public Domain.
+   https://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * @import { BrowserTestUtils } from "../../../../../testing/mochitest/BrowserTestUtils/BrowserTestUtils.sys.mjs"
+ * @import { PageExtractorParent } from "../../PageExtractorParent.sys.mjs"
+ */
+
+add_task(async function test_dom_extractor() {
+  const { actor, cleanup } = await html`
+    <article>
+      <h1>Hello World</h1>
+      <p>This is a paragraph</p>
+    </article>
+  `;
+
+  is(
+    await actor.getText(),
+    ["Hello World", "This is a paragraph"].join("\n"),
+    "Text can be extracted from the page."
+  );
+
+  is(
+    await actor.getReaderModeContent(true /* force */),
+    "Hello World\nThis is a paragraph",
+    "Reader mode can extract page content."
+  );
+
+  is(
+    await actor.getReaderModeContent(),
+    null,
+    "Nothing is returned on non-reader mode content."
+  );
+  return cleanup();
+});
diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js
@@ -0,0 +1,78 @@
+/* Any copyright is dedicated to the Public Domain.
+   https://creativecommons.org/publicdomain/zero/1.0/ */
+
+/**
+ * Use a tagged template literal to create a page extraction actor test. This spins
+ * up an http server that serves the markup in a new tab. The page extractor can then
+ * be used on the page.
+ *
+ * @param {TemplateStringsArray} strings - The literal string parts.
+ * @param {...any} values - The interpolated expressions.
+ */
+async function html(strings, ...values) {
+  // Convert the arguments into markup.
+  let markup = "";
+  for (let i = 0; i < strings.length; i++) {
+    markup += strings[i];
+    if (i < values.length) {
+      markup += values[i];
+    }
+  }
+
+  markup = `<!DOCTYPE html><body>${markup}</body>`;
+
+  const { url, serverClosed } = serveOnce(markup);
+
+  const tab = await BrowserTestUtils.openNewForegroundTab(
+    gBrowser,
+    url,
+    true // waitForLoad
+  );
+
+  /** @type {PageExtractorParent} */
+  const actor =
+    tab.linkedBrowser.browsingContext.currentWindowGlobal.getActor(
+      "PageExtractor"
+    );
+
+  return {
+    actor,
+    async cleanup() {
+      info("Cleaning up");
+      await serverClosed;
+      BrowserTestUtils.removeTab(tab);
+    },
+  };
+}
+
+/**
+ * Start an HTTP server that serves page.html with the provided HTML.
+ *
+ * @param {string} html
+ */
+function serveOnce(html) {
+  /** @type {import("../../../../../netwerk/test/httpserver/httpd.sys.mjs")} */
+  const { HttpServer } = ChromeUtils.importESModule(
+    "resource://testing-common/httpd.sys.mjs"
+  );
+  info("Create server");
+  const server = new HttpServer();
+
+  const { promise, resolve } = Promise.withResolvers();
+
+  server.registerPathHandler("/page.html", (_request, response) => {
+    info("Request received for: " + url);
+    response.setHeader("Content-Type", "text/html");
+    response.write(html);
+    resolve(server.stop());
+  });
+
+  server.start(-1);
+
+  let { primaryHost, primaryPort } = server.identity;
+  // eslint-disable-next-line @microsoft/sdl/no-insecure-url
+  const url = `http://${primaryHost}:${primaryPort}/page.html`;
+  info("Server listening for: " + url);
+
+  return { url, serverClosed: promise };
+}
diff --git a/toolkit/modules/ActorManagerParent.sys.mjs b/toolkit/modules/ActorManagerParent.sys.mjs
@@ -452,6 +452,26 @@ let JSWINDOWACTORS = {
     allFrames: true,
   },
 
+  PageExtractor: {
+    parent: {
+      esModuleURI: "resource://gre/actors/PageExtractorParent.sys.mjs",
+    },
+    child: {
+      esModuleURI: "resource://gre/actors/PageExtractorChild.sys.mjs",
+      events: {
+        DOMContentLoaded: { createActor: false },
+      },
+    },
+    matches: [
+      "http://*/*",
+      "https://*/*",
+      "file:///*",
+      "moz-extension://*",
+      "data:text/html,*",
+    ],
+    messageManagerGroups: ["browsers"],
+  },
+
   PopupAndRedirectBlocking: {
     parent: {
       esModuleURI:

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	toolkit/components/moz.build	\|	1	+
A	toolkit/components/pageextractor/DOMExtractor.sys.mjs	\|	500	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	toolkit/components/pageextractor/PageExtractor.d.ts	\|	10	++++++++++
A	toolkit/components/pageextractor/PageExtractorChild.sys.mjs	\|	125	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	toolkit/components/pageextractor/PageExtractorParent.sys.mjs	\|	42	++++++++++++++++++++++++++++++++++++++++++
A	toolkit/components/pageextractor/moz.build	\|	17	+++++++++++++++++
A	toolkit/components/pageextractor/tests/browser/browser.toml	\|	9	+++++++++
A	toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js	\|	37	+++++++++++++++++++++++++++++++++++++
A	toolkit/components/pageextractor/tests/browser/head.js	\|	78	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	toolkit/modules/ActorManagerParent.sys.mjs	\|	20	++++++++++++++++++++