tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit fde651f861f2afc281f5f9c4a581e93c85e1eca0
parent 3e28bf17eedf93851c338e8b024b98849e7468f4
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Mon,  8 Dec 2025 15:34:00 +0000

Bug 1998170 - Just capture the viewport in the PageExtractor r=ai-ondevice-reviewers,tarek

Differential Revision: https://phabricator.services.mozilla.com/D273489

Diffstat:
M toolkit/components/pageextractor/DOMExtractor.sys.mjs | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M toolkit/components/pageextractor/PageExtractorChild.sys.mjs | 8 ++------
M toolkit/components/pageextractor/tests/browser/browser.toml | 2 ++
A toolkit/components/pageextractor/tests/browser/browser_viewport_extractor.js | 46 ++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/pageextractor/tests/browser/head.js | 2 ++
5 files changed, 108 insertions(+), 11 deletions(-)

diff --git a/toolkit/components/pageextractor/DOMExtractor.sys.mjs b/toolkit/components/pageextractor/DOMExtractor.sys.mjs @@ -5,7 +5,7 @@ // @ts-check /** - * @import { GetTextOptions } from './PageExtractor.js' + * @import { GetTextOptions } from './PageExtractor.d.ts' */ /** @@ -34,12 +34,31 @@ class ExtractionContext { #textContent = ""; /** + * When extracting content just from the viewport, this value will be set. + * + * @type {{ top: number; left: number; right: number; bottom: number } | null} + */ + #viewportRect = null; + + /** * Constructs a new extraction context with the provided options. * + * @param {Document} document * @param {GetTextOptions} options */ - constructor(options) { + constructor(document, options) { this.#options = options; + + if (options.justViewport) { + const { visualViewport } = document.defaultView; + const { offsetTop, offsetLeft, width, height } = visualViewport; + this.#viewportRect = { + top: offsetTop, + left: offsetLeft, + right: offsetLeft + width, + bottom: offsetTop + height, + }; + } } /** @@ -90,6 +109,34 @@ class ExtractionContext { } /** + * When capturing content only in the viewport, skip nodes that are outside of it. + * + * @param {Node} node + */ + maybeOutOfViewport(node) { + if (!this.#viewportRect) { + // We don't have a viewport rect, so skip this check. + return false; + } + const element = getHTMLElementForStyle(node); + if (!element) { + return false; + } + + const rect = element.getBoundingClientRect(); + if (!rect) { + return false; + } + + return ( + rect.bottom <= this.#viewportRect.top || + rect.top >= this.#viewportRect.bottom || + rect.right <= this.#viewportRect.left || + rect.left >= this.#viewportRect.right + ); + } + + /** * Append the node's text content to the accumulated text only if the node * itself as well as no ancestor of the node has already been processed. 
* @@ -106,6 +153,11 @@ class ExtractionContext { return; } + if (this.maybeOutOfViewport(node)) { + // This only can return true when we're capturing just the viewport nodes. + return; + } + const element = asHTMLElement(node); const text = asTextNode(node); let innerText = ""; @@ -136,11 +188,11 @@ class ExtractionContext { * @returns {string} */ export function extractTextFromDOM(document, options) { - const context = new ExtractionContext(options); + const context = new ExtractionContext(document, options); subdivideAndExtractText(document.body, context); - return context.textContent; + return context.textContent.trim(); } /** @@ -349,7 +401,6 @@ function isNodeHidden(node) { } // This is an issue with the DOM library generation. - // @ts-expect-error Property 'display' does not exist on type 'CSSStyleDeclaration'.ts(2339) const { display, visibility, opacity } = style; return ( diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs @@ -5,7 +5,7 @@ // @ts-check /** - * @import { GetTextOptions } from './PageExtractor.js' + * @import { GetTextOptions } from './PageExtractor.d.ts' * @import { PageExtractorParent } from './PageExtractorParent.sys.mjs' */ @@ -140,16 +140,12 @@ export class PageExtractorChild extends JSWindowActorChild { throw new Error("Boilerplate removal is not supported yet."); } - if (options.justViewport) { - throw new Error("Just getting the viewport is not supported yet."); - } - const text = lazy.extractTextFromDOM(document, options); lazy.console.log("GetText", options); lazy.console.debug(text); - return text.trim(); + return text; } /** diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml @@ -17,3 +17,5 @@ skip-if = [ ] ["browser_headless_extractor.js"] + +["browser_viewport_extractor.js"] diff --git a/toolkit/components/pageextractor/tests/browser/browser_viewport_extractor.js 
b/toolkit/components/pageextractor/tests/browser/browser_viewport_extractor.js @@ -0,0 +1,46 @@ +/* Any copyright is dedicated to the Public Domain. + https://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +add_task(async function test_viewport_extraction() { + const { actor, cleanup, tab } = await html` + <style> + body { + margin: 0; + } + .page { + margin-bottom: 20px; + height: 100vh; + box-sizing: border-box; + } + </style> + <div class="page" id="page-1">Viewport page 1</div> + <div class="page" id="page-2">Viewport page 2</div> + <div class="page" id="page-3">Viewport page 3</div> + `; + + is( + await actor.getText({ justViewport: true }), + "Viewport page 1", + "Viewport-only extraction returns the first page." + ); + + await SpecialPowers.spawn(tab.linkedBrowser, [], async () => { + content.document.getElementById("page-2").scrollIntoView(); + }); + + is( + await actor.getText({ justViewport: true }), + "Viewport page 2", + "Viewport extraction follows the current scroll position." + ); + + is( + await actor.getText(), + ["Viewport page 1", "Viewport page 2", "Viewport page 3"].join("\n"), + "Full document extraction includes all content." + ); + + await cleanup(); +}); diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js @@ -48,6 +48,8 @@ async function html(strings, ...values) { */ actor, + tab, + /** * Get a new page extractor, which can change when navigating pages. *