commit fde651f861f2afc281f5f9c4a581e93c85e1eca0
parent 3e28bf17eedf93851c338e8b024b98849e7468f4
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Mon, 8 Dec 2025 15:34:00 +0000
Bug 1998170 - Just capture the viewport in the PageExtractor r=ai-ondevice-reviewers,tarek
Differential Revision: https://phabricator.services.mozilla.com/D273489
Diffstat:
5 files changed, 108 insertions(+), 11 deletions(-)
diff --git a/toolkit/components/pageextractor/DOMExtractor.sys.mjs b/toolkit/components/pageextractor/DOMExtractor.sys.mjs
@@ -5,7 +5,7 @@
// @ts-check
/**
- * @import { GetTextOptions } from './PageExtractor.js'
+ * @import { GetTextOptions } from './PageExtractor.d.ts'
*/
/**
@@ -34,12 +34,31 @@ class ExtractionContext {
#textContent = "";
/**
+ * Bounds of the visual viewport, built from its offsets and size in the constructor
+ * when `options.justViewport` is set. Stays null otherwise.
+ * @type {{ top: number; left: number; right: number; bottom: number } | null}
+ */
+ #viewportRect = null;
+
+ /**
* Constructs a new extraction context with the provided options.
*
+ * @param {Document} document - Its window's visualViewport is read when options.justViewport is set.
* @param {GetTextOptions} options
*/
- constructor(options) {
+ constructor(document, options) {
this.#options = options;
+ // Capture the viewport bounds once, up front; maybeOutOfViewport() compares nodes against them.
+ if (options.justViewport) {
+ const { visualViewport } = document.defaultView;
+ const { offsetTop, offsetLeft, width, height } = visualViewport;
+ this.#viewportRect = {
+ top: offsetTop,
+ left: offsetLeft,
+ right: offsetLeft + width,
+ bottom: offsetTop + height,
+ };
+ }
}
/**
@@ -90,6 +109,34 @@ class ExtractionContext {
}
/**
+ * When capturing only the viewport, report whether the node's element lies outside it.
+ * @param {Node} node - Resolved to an element through getHTMLElementForStyle.
+ * @returns {boolean} False when no viewport rect was captured or no element is found.
+ */
+ maybeOutOfViewport(node) {
+ if (!this.#viewportRect) {
+ // No viewport rect was captured (not a viewport-only extraction), so never skip.
+ return false;
+ }
+ const element = getHTMLElementForStyle(node);
+ if (!element) {
+ return false;
+ }
+ // NOTE(review): getBoundingClientRect() should always return a DOMRect, so the guard below looks unreachable - confirm.
+ const rect = element.getBoundingClientRect();
+ if (!rect) {
+ return false;
+ }
+ // The comparisons are inclusive, so a rect that only touches a viewport edge counts as outside.
+ return (
+ rect.bottom <= this.#viewportRect.top ||
+ rect.top >= this.#viewportRect.bottom ||
+ rect.right <= this.#viewportRect.left ||
+ rect.left >= this.#viewportRect.right
+ );
+ }
+
+ /**
* Append the node's text content to the accumulated text only if the node
* itself as well as no ancestor of the node has already been processed.
*
@@ -106,6 +153,11 @@ class ExtractionContext {
return;
}
+ if (this.maybeOutOfViewport(node)) {
+ // This can only return true when we're capturing just the viewport nodes.
+ return;
+ }
+
const element = asHTMLElement(node);
const text = asTextNode(node);
let innerText = "";
@@ -136,11 +188,11 @@ class ExtractionContext {
* @returns {string}
*/
export function extractTextFromDOM(document, options) {
- const context = new ExtractionContext(options);
+ const context = new ExtractionContext(document, options);
subdivideAndExtractText(document.body, context);
- return context.textContent;
+ return context.textContent.trim(); // Leading/trailing whitespace is stripped here.
}
/**
@@ -349,7 +401,6 @@ function isNodeHidden(node) {
}
// This is an issue with the DOM library generation.
- // @ts-expect-error Property 'display' does not exist on type 'CSSStyleDeclaration'.ts(2339)
const { display, visibility, opacity } = style;
return (
diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs
@@ -5,7 +5,7 @@
// @ts-check
/**
- * @import { GetTextOptions } from './PageExtractor.js'
+ * @import { GetTextOptions } from './PageExtractor.d.ts'
* @import { PageExtractorParent } from './PageExtractorParent.sys.mjs'
*/
@@ -140,16 +140,12 @@ export class PageExtractorChild extends JSWindowActorChild {
throw new Error("Boilerplate removal is not supported yet.");
}
- if (options.justViewport) {
- throw new Error("Just getting the viewport is not supported yet.");
- }
-
const text = lazy.extractTextFromDOM(document, options);
lazy.console.log("GetText", options);
lazy.console.debug(text);
- return text.trim();
+ return text;
}
/**
diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml
@@ -17,3 +17,5 @@ skip-if = [
]
["browser_headless_extractor.js"]
+
+["browser_viewport_extractor.js"]
diff --git a/toolkit/components/pageextractor/tests/browser/browser_viewport_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_viewport_extractor.js
@@ -0,0 +1,46 @@
+/* Any copyright is dedicated to the Public Domain.
+ https://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+add_task(async function test_viewport_extraction() {
+ const { actor, cleanup, tab } = await html`
+ <style>
+ body {
+ margin: 0;
+ }
+ .page {
+ margin-bottom: 20px;
+ height: 100vh;
+ box-sizing: border-box;
+ }
+ </style>
+ <div class="page" id="page-1">Viewport page 1</div>
+ <div class="page" id="page-2">Viewport page 2</div>
+ <div class="page" id="page-3">Viewport page 3</div>
+ `;
+ // Each .page is one viewport tall (100vh), so only page 1 is on screen before scrolling.
+ is(
+ await actor.getText({ justViewport: true }),
+ "Viewport page 1",
+ "Viewport-only extraction returns the first page."
+ );
+ // Scroll page 2 into view inside the content process, then extract again.
+ await SpecialPowers.spawn(tab.linkedBrowser, [], async () => {
+ content.document.getElementById("page-2").scrollIntoView();
+ });
+
+ is(
+ await actor.getText({ justViewport: true }),
+ "Viewport page 2",
+ "Viewport extraction follows the current scroll position."
+ );
+ // Without justViewport, the text of all three pages is expected.
+ is(
+ await actor.getText(),
+ ["Viewport page 1", "Viewport page 2", "Viewport page 3"].join("\n"),
+ "Full document extraction includes all content."
+ );
+
+ await cleanup();
+});
diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js
@@ -48,6 +48,8 @@ async function html(strings, ...values) {
*/
actor,
+ tab,
+
/**
* Get a new page extractor, which can change when navigating pages.
*