[ tor-browser ].git.dasho

commit c14535a918527d5324c7224f1782bbd7c9ec7b7f
parent 4cbcb6be76bc54b6210a261124f3dd63dede0f73
Author: Erik Nordin <enordin@mozilla.com>
Date:   Sat, 18 Oct 2025 02:16:51 +0000

Bug 1967758 - Rework DOM text extractor to have a context r=translations-reviewers,gregtatum

This patch reworks the DOM text extractor logic in two ways:

1) There is now a context that is passed through the recursion,
   which includes the configuration options that were passed.

2) The extracted text is not built incrementally during traversal,
   rather than traversing and then iterating through the set of
   nodes in two passes.

Both of these aspects are important for the following patch, which
will implement an early-exit strategy if a `sufficientLength` limit
is provided in the configuration options.

Differential Revision: https://phabricator.services.mozilla.com/D268283

Diffstat:
M toolkit/components/pageextractor/DOMExtractor.sys.mjs  | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M toolkit/components/pageextractor/PageExtractorChild.sys.mjs  | 2 +-

2 files changed, 112 insertions(+), 44 deletions(-)
diff --git a/toolkit/components/pageextractor/DOMExtractor.sys.mjs b/toolkit/components/pageextractor/DOMExtractor.sys.mjs
@@ -5,29 +5,119 @@
 // @ts-check
 
 /**
- * @param {Document} document
- * @returns {string}
+ * @import { GetTextOptions } from './PageExtractor.js'
+ */
+
+/**
+ * The context for extracting text content from the DOM.
  */
-export function extractTextFromDOM(document) {
-  const blocks = subdivideNodeIntoBlocks(document.body);
+class ExtractionContext {
+  /**
+   * Set of nodes that have already been processed, used to avoid duplicating text extraction.
+   *
+   * @type {Set<Node>}
+   */
+  #processedNodes = new Set();
+
+  /**
+   * The text-extraction options, provided at initialization.
+   *
+   * @type {GetTextOptions}
+   */
+  #options;
+
+  /**
+   * The accumulated text content that has been extracted from the DOM.
+   *
+   * @type {string}
+   */
+  #textContent = "";
+
+  /**
+   * Constructs a new extraction context with the provided options.
+   *
+   * @param {GetTextOptions} options
+   */
+  constructor(options) {
+    this.#options = options;
+  }
+
+  /**
+   * Accumulated text content produced during traversal.
+   *
+   * @returns {string}
+   */
+  get textContent() {
+    return this.#textContent;
+  }
+
+  /**
+   * Returns true if this node or its ancestor's text content has
+   * already been extracted from the DOM.
+   *
+   * @param {Node} node
+   */
+  #isNodeProcessed(node) {
+    if (this.#processedNodes.has(node)) {
+      return true;
+    }
 
-  let textContent = "";
-  for (const block of blocks) {
+    for (const ancestor of getAncestorsIterator(node)) {
+      if (this.#processedNodes.has(ancestor)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Append the node's text content to the accumulated text only if the node
+   * itself as well as no ancestor of the node has already been processed.
+   *
+   * @param {Node} node
+   */
+  maybeAppendTextContent(node) {
+    if (this.#isNodeProcessed(node)) {
+      return;
+    }
+
+    this.#processedNodes.add(node);
+
+    const element = asHTMLElement(node);
+    const text = asTextNode(node);
     let innerText = "";
-    const element = asHTMLElement(block);
-    const text = asTextNode(block);
 
     if (element) {
       innerText = element.innerText.trim();
     } else if (text?.nodeValue) {
       innerText = text.nodeValue.trim();
     }
+
     if (innerText) {
-      textContent += "\n" + innerText;
+      this.#textContent += "\n" + innerText;
     }
   }
+}
+
+/**
+ * Extracts visible text content from the DOM.
+ *
+ * By default, this extracts content from the entire page.
+ *
+ * Callers may specify filters for the extracted text via
+ * the supported options @see {GetTextOptions}.
+ *
+ * @param {Document} document
+ * @param {GetTextOptions} options
+ *
+ * @returns {string}
+ */
+export function extractTextFromDOM(document, options) {
+  const context = new ExtractionContext(options);
 
-  return textContent;
+  subdivideAndExtractText(document.body, context);
+
+  return context.textContent;
 }
 
 /**
@@ -317,30 +407,26 @@ function hasNonWhitespaceTextNodes(node) {
  * of inline text can be found.
  *
  * @param {Node} node
- * @returns {Set<Node>}
+ * @param {ExtractionContext} context
  */
-function subdivideNodeIntoBlocks(node) {
-  /** @type {Set<Node>} */
-  const blocks = new Set();
+function subdivideAndExtractText(node, context) {
   switch (determineBlockStatus(node)) {
     case NodeFilter.FILTER_REJECT: {
       // This node is rejected as it shouldn't be used for text extraction.
-      return blocks;
+      return;
     }
 
     // Either a shadow host or a block element
     case NodeFilter.FILTER_ACCEPT: {
       const shadowRoot = getShadowRoot(node);
       if (shadowRoot) {
-        processSubdivide(shadowRoot, blocks);
+        processSubdivide(shadowRoot, context);
       } else {
         const element = asHTMLElement(node);
         if (element && isHTMLElementHidden(element)) {
           break;
         }
-        if (noAncestorsAdded(node, blocks)) {
-          blocks.add(node);
-        }
+        context.maybeAppendTextContent(node);
       }
       break;
     }
@@ -349,11 +435,10 @@ function subdivideNodeIntoBlocks(node) {
       // This node may have text to extract, but it needs to be subdivided into smaller
       // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes
       // that contain enough inline elements to extract.
-      processSubdivide(node, blocks);
+      processSubdivide(node, context);
       break;
     }
   }
-  return blocks;
 }
 
 /**
@@ -361,9 +446,9 @@ function subdivideNodeIntoBlocks(node) {
  * through the DOM tree of nodes, including elements in the Shadow DOM.
  *
  * @param {Node} node
- * @param {Set<Node>} blocks
+ * @param {ExtractionContext} context
  */
-function processSubdivide(node, blocks) {
+function processSubdivide(node, context) {
   const { ownerDocument } = node;
   if (!ownerDocument) {
     return;
@@ -381,28 +466,11 @@ function processSubdivide(node, blocks) {
   while ((currentNode = nodeIterator.nextNode())) {
     const shadowRoot = getShadowRoot(currentNode);
     if (shadowRoot) {
-      processSubdivide(shadowRoot, blocks);
-    } else if (noAncestorsAdded(currentNode, blocks)) {
-      blocks.add(currentNode);
-    }
-  }
-}
-
-/**
- * TODO - The original TranslationsDocument algorithm didn't require this, so perhaps
- * something was not ported correctly. This should be removed to see if the error
- * can be reproduced, and this mitigation removed.
- *
- * @param {Node} node
- * @param {Set<Node>} blocks
- */
-function noAncestorsAdded(node, blocks) {
-  for (const ancestor of getAncestorsIterator(node)) {
-    if (blocks.has(ancestor)) {
-      return false;
+      processSubdivide(shadowRoot, context);
+    } else {
+      context.maybeAppendTextContent(currentNode);
     }
   }
-  return true;
 }
 
 /**
diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs
@@ -115,7 +115,7 @@ export class PageExtractorChild extends JSWindowActorChild {
       throw new Error("Just getting the viewport is not supported yet.");
     }
 
-    const text = lazy.extractTextFromDOM(document);
+    const text = lazy.extractTextFromDOM(document, options);
 
     lazy.console.log("GetText", options);
     lazy.console.debug(text);

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	toolkit/components/pageextractor/DOMExtractor.sys.mjs	\|	154	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M	toolkit/components/pageextractor/PageExtractorChild.sys.mjs	\|	2	+-