commit 1733049bed20fd96ff84a9a9f252f3b1e319b681
parent 7279cec1892bb643802f0bad6d992e25d147f85a
Author: Erik Nordin <enordin@mozilla.com>
Date: Wed, 22 Oct 2025 16:50:27 +0000
Bug 1995314 - Ensure hidden text nodes are ignored during extraction r=gregtatum
This patch updates the page extraction algorithm to consider whether
all encountered nodes are hidden, rather than only HTML elements.
This improves the overall extraction algorithm, ensuring that only
page text that is truly visible to the user is extracted.
Differential Revision: https://phabricator.services.mozilla.com/D269234
Diffstat:
2 files changed, 120 insertions(+), 7 deletions(-)
diff --git a/toolkit/components/pageextractor/DOMExtractor.sys.mjs b/toolkit/components/pageextractor/DOMExtractor.sys.mjs
@@ -102,6 +102,10 @@ class ExtractionContext {
this.#processedNodes.add(node);
+ if (isNodeHidden(node)) {
+ return;
+ }
+
const element = asHTMLElement(node);
const text = asTextNode(node);
let innerText = "";
@@ -279,13 +283,19 @@ function nodeNeedsSubdividing(node) {
}
/**
- * Returns true if an HTML element is hidden based on factors such as collapsed state and
+ * Returns true if a node is hidden based on factors such as collapsed state and
* computed style, otherwise false.
*
- * @param {HTMLElement} element
+ * @param {Node} node
* @returns {boolean}
*/
-function isHTMLElementHidden(element) {
+function isNodeHidden(node) {
+ const element = getHTMLElementForStyle(node);
+
+ if (!element) {
+ return true;
+ }
+
// This is a cheap and easy check that will not compute style or force reflow.
if (element.hidden) {
// The element is explicitly hidden.
@@ -445,10 +455,6 @@ function subdivideAndExtractText(node, context) {
if (shadowRoot) {
processSubdivide(shadowRoot, context);
} else {
- const element = asHTMLElement(node);
- if (element && isHTMLElementHidden(element)) {
- break;
- }
context.maybeAppendTextContent(node);
}
break;
@@ -596,3 +602,31 @@ function asHTMLElement(node) {
}
return null;
}
+
+/**
+ * Resolves the HTML element whose style applies to a node: the node itself if it is an
+ * HTML element, otherwise its parentElement or, failing that, its flattenedTreeParentNode.
+ *
+ * @param {Node} node - The node to resolve a style-bearing element for.
+ *
+ * @returns {HTMLElement | null} The asHTMLElement() result for the chosen candidate, or null if there is none.
+ */
+function getHTMLElementForStyle(node) {
+ const element = asHTMLElement(node);
+ if (element) {
+ return element;
+ }
+
+ if (node.parentElement) {
+ return asHTMLElement(node.parentElement);
+ }
+
+ // A node whose parent is a ShadowRoot (e.g. a text node) has no parentElement,
+ // so fall back to flattenedTreeParentNode.
+ if (node.flattenedTreeParentNode) {
+ return asHTMLElement(node.flattenedTreeParentNode);
+ }
+
+ // No candidate element: presumably the node is not connected or doesn't have a frame.
+ return null;
+}
diff --git a/toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js
@@ -83,3 +83,82 @@ add_task(async function test_dom_extractor_sufficient_length_option() {
return cleanup();
});
+
+add_task(
+ async function test_dom_extractor_ignores_hidden_and_collapsed_nodes() {
+ const { actor, cleanup } = await html`
+ <article>
+ <!-- Visible header -->
+ <h1>Visible Title</h1>
+
+ <!-- Visible paragraph -->
+ <p>Visible paragraph</p>
+
+ <!-- Hidden via the [hidden] attribute -->
+ <p hidden>Hidden via [hidden]</p>
+
+ <!-- Hidden via display:none -->
+ <p style="display:none">Hidden via display:none</p>
+
+ <!-- Hidden via visibility:hidden -->
+ <p style="visibility:hidden">Hidden via visibility:hidden</p>
+
+ <!-- Hidden via opacity:0 -->
+ <p style="opacity:0">Hidden via opacity:0</p>
+
+ <!-- Visible block within hidden inline container -->
+ <span style="width:0; height:0; overflow:hidden">
+ <div>Block text within zero-sized inline container</div>
+ </span>
+
+ <!-- Hidden block container with inline descendant -->
+ <div hidden>
+ Hidden container outer text
+ <span>Hidden container inner text</span>
+ </div>
+
+ <!-- Visible block container with hidden inline descendant -->
+ <div>
+ Visible container outer text (hidden descendant)
+ <span hidden>Hidden child text in visible container</span>
+ </div>
+
+ <!-- Hidden inline container with block descendant -->
+ <span hidden>
+ Hidden inline outer text
+ <div>Hidden inline inner text</div>
+ </span>
+
+ <!-- Visible inline container with hidden block descendant -->
+ <span>
+ Visible inline outer text (hidden descendant)
+ <div hidden>Hidden block descendant text</div>
+ </span>
+
+ <!-- Collapsed <details> with <summary> still visible -->
+ <details>
+ <summary>Summary is visible</summary>
+ <div>Hidden inside closed details</div>
+ Text node directly under closed details (hidden)
+ </details>
+ </article>
+ `;
+
+ const expected = [ // Visible fixture strings only; every hidden or collapsed string above is omitted.
+ "Visible Title",
+ "Visible paragraph",
+ "Block text within zero-sized inline container",
+ "Visible container outer text (hidden descendant)",
+ "Visible inline outer text (hidden descendant)",
+ "Summary is visible",
+ ].join("\n"); // The extracted text is compared as a single newline-separated string.
+
+ is(
+ await actor.getText(),
+ expected,
+ "The extractor returns only visible text."
+ );
+
+ return cleanup();
+ }
+);