tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 738780f1f83bcfe282d30051b08c666d5b778353
parent 5ea2407af277878756eef5539935066c443bf35c
Author: Tom Zhang <tzhang@mozilla.com>
Date:   Thu, 18 Dec 2025 17:11:19 +0000

Bug 2004843 - Implement get_page_content tool r=gregtatum,ai-models-reviewers,tburrell

Differential Revision: https://phabricator.services.mozilla.com/D276302

Diffstat:
M browser/components/aiwindow/models/Tools.sys.mjs | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M browser/components/aiwindow/models/moz.build | 4 ++++
A browser/components/aiwindow/models/tests/browser/browser.toml | 6 ++++++
A browser/components/aiwindow/models/tests/browser/browser_get_page_content.js | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/browser/head.js | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js | 819 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml | 2 ++
7 files changed, 1228 insertions(+), 16 deletions(-)

diff --git a/browser/components/aiwindow/models/Tools.sys.mjs b/browser/components/aiwindow/models/Tools.sys.mjs @@ -5,10 +5,11 @@ */ /** - * This file contains LLM tool abscrations and tool definitions. + * This file contains LLM tool abstractions and tool definitions. */ import { searchBrowsingHistory as implSearchBrowsingHistory } from "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs"; +import { PageExtractorParent } from "resource://gre/actors/PageExtractorParent.sys.mjs"; const lazy = {}; ChromeUtils.defineESModuleGetters(lazy, { @@ -19,18 +20,19 @@ ChromeUtils.defineESModuleGetters(lazy, { const GET_OPEN_TABS = "get_open_tabs"; const SEARCH_BROWSING_HISTORY = "search_browsing_history"; +const GET_PAGE_CONTENT = "get_page_content"; -// eslint-disable-next-line no-unused-vars -const TOOLS = [GET_OPEN_TABS, SEARCH_BROWSING_HISTORY]; +export const TOOLS = [GET_OPEN_TABS, SEARCH_BROWSING_HISTORY, GET_PAGE_CONTENT]; -// eslint-disable-next-line no-unused-vars -const toolsConfig = [ +export const toolsConfig = [ { type: "function", function: { name: GET_OPEN_TABS, description: - "Access the user's browser and return a list of most recently browsed tabs. Each tab is represented by a JSON with the page's url, title and description if available. Default to return maximum 15 tabs.", + "Access the user's browser and return a list of most recently browsed tabs. " + + "Each tab is represented by a JSON with the page's url, title and description " + + "if available. Default to return maximum 15 tabs.", parameters: { type: "object", properties: {}, @@ -42,25 +44,39 @@ const toolsConfig = [ function: { name: SEARCH_BROWSING_HISTORY, description: - 'Refind pages from the user\'s PAST BROWSING HISTORY. Use this whenever the user wants to recall, review, list, or see pages they visited earlier (for a topic, site, or time period). 
Also use this when the user requests all pages from a past time period (e.g., "yesterday", "last week"), even if no topic is specified. Do NOT use for open tabs, completely general web questions, or abstract questions about "history" or habits.', + "Refind pages from the user's PAST BROWSING HISTORY. Use this whenever the " + + "user wants to recall, review, list, or see pages they visited earlier (for a " + + "topic, site, or time period). Also use this when the user requests all pages " + + 'from a past time period (e.g., "yesterday", "last week"), even if no topic is ' + + "specified. Do NOT use for open tabs, completely general web questions, or " + + 'abstract questions about "history" or habits.', parameters: { type: "object", properties: { searchTerm: { type: "string", description: - "A detailed, noun-heavy phrase (~2-12 meaningful tokens) summarizing the user's intent for semantic retrieval. Include the main entity/topic plus 1-3 contextual qualifiers (e.g., library name, purpose, site, or timeframe). Avoid vague or single-word queries.", + "A detailed, noun-heavy phrase (~2-12 meaningful tokens) summarizing " + + "the user's intent for semantic retrieval. Include the main entity/topic " + + "plus 1-3 contextual qualifiers (e.g., library name, purpose, site, or " + + "timeframe). Avoid vague or single-word queries.", }, startTs: { type: "string", description: - "Inclusive lower bound of the time window as an ISO 8601 datetime string (e.g., '2025-11-07T09:00:00-05:00'). Use when the user asks for results within a time or range start, such as 'last week', 'since yesterday', or 'last night'. This must be before the user's current datetime.", + "Inclusive lower bound of the time window as an ISO 8601 datetime string " + + "(e.g., '2025-11-07T09:00:00-05:00'). Use when the user asks for results " + + "within a time or range start, such as 'last week', 'since yesterday', or" + + "'last night'. 
This must be before the user's current datetime.", default: null, }, endTs: { type: "string", description: - "Inclusive upper bound of the time window as an ISO 8601 datetime string (e.g., '2025-11-07T21:00:00-05:00'). Use when the user asks for results within a time or range end, such as 'last week', 'between 2025-10-01 and 2025-10-31', or 'before Monday'. This must be before the user's current datetime.", + "Inclusive upper bound of the time window as an ISO 8601 datetime string " + + "(e.g., '2025-11-07T21:00:00-05:00'). Use when the user asks for results " + + "within a time or range end, such as 'last week', 'between 2025-10-01 and " + + "2025-10-31', or 'before Monday'. This must be before the user's current datetime.", default: null, }, }, @@ -68,6 +84,26 @@ const toolsConfig = [ }, }, }, + { + type: "function", + function: { + name: GET_PAGE_CONTENT, + description: + "Retrieve cleaned text content of the provided browser page URL.", + parameters: { + properties: { + url: { + type: "string", + description: + "The complete URL of the page to fetch content from. This must exactly match " + + "a URL from the current conversation context. Use the full URL including " + + "protocol (http/https). Example: 'https://www.example.com/article'.", + }, + }, + required: ["url"], + }, + }, + }, ]; /** @@ -126,7 +162,7 @@ export async function getOpenTabs(n = 15) { } /** - * Tool entrypoint for browsing history search. + * Tool entrypoint for search_browsing_history. * * Parameters (defaults shown): * - searchTerm: "" - string used for search @@ -136,16 +172,16 @@ export async function getOpenTabs(n = 15) { * * Detailed behavior and implementation are in SearchBrowsingHistory.sys.mjs. * - * @param {object} params + * @param {object} toolParams * The search parameters. - * @param {string} params.searchTerm + * @param {string} toolParams.searchTerm * The search string. 
If null or empty, semantic search is skipped and * results are filtered by time range and sorted by last_visit_date and frecency. - * @param {string|null} params.startTs + * @param {string|null} toolParams.startTs * Optional ISO-8601 start timestamp (e.g. "2025-11-07T09:00:00-05:00"). - * @param {string|null} params.endTs + * @param {string|null} toolParams.endTs * Optional ISO-8601 end timestamp (e.g. "2025-11-07T09:00:00-05:00"). - * @param {number} params.historyLimit + * @param {number} toolParams.historyLimit * Maximum number of history results to return. * @returns {Promise<object>} * A promise resolving to an object with the search term and history results. @@ -200,3 +236,219 @@ export function stripSearchBrowsingHistoryFields(result) { return result; } } + +/** + * Class for handling page content extraction with configurable modes and limits. + */ +export class GetPageContent { + static DEFAULT_MODE = "reader"; + static FALLBACK_MODE = "full"; + static MAX_CHARACTERS = 10000; + + static MODE_HANDLERS = { + viewport: async pageExtractor => { + const result = await pageExtractor.getText({ justViewport: true }); + return { text: result.text }; + }, + reader: async pageExtractor => { + const text = await pageExtractor.getReaderModeContent(); + return { text: typeof text === "string" ? text : "" }; + }, + full: async pageExtractor => { + const result = await pageExtractor.getText(); + return { text: result }; + }, + }; + + /** + * Tool entrypoint for get_page_content. + * + * @param {object} toolParams + * @param {string} toolParams.url + * @param {Set<string>} allowedUrls + * @returns {Promise<string>} + * A promise resolving to a string containing the extracted page content + * with a descriptive header, or an error message if extraction fails. 
+ */ + static async getPageContent({ url }, allowedUrls) { + try { + // Search through the allowed URLs and extract directly if exists + if (!allowedUrls.has(url)) { + // Bug 2006418 - This will load the page headlessly, and then extract the content. + // It might be a better idea to have the lifetime of the page be tied to the chat + // while it's open, and with a "keep alive" timeout. For now it's simpler to just + // load the page fresh every time. + return PageExtractorParent.getHeadlessExtractor(url, pageExtractor => + this.#runExtraction(pageExtractor, this.DEFAULT_MODE, url) + ); + } + + // TODO: figure out what windows we can access to give permission here, and update this API + let win = lazy.BrowserWindowTracker.getTopWindow(); + let gBrowser = win.gBrowser; + let tabs = gBrowser.tabs; + + // Find the tab with the matching URL in browser + let targetTab = null; + for (let i = 0; i < tabs.length; i++) { + const tab = tabs[i]; + const currentURI = tab?.linkedBrowser?.currentURI; + if (currentURI?.spec === url) { + targetTab = tab; + break; + } + } + + // If no match, try hostname matching for cases where protocols differ + if (!targetTab) { + try { + const inputHostPort = new URL(url).host; + targetTab = tabs.find(tab => { + try { + const tabHostPort = tab.linkedBrowser.currentURI.hostPort; + return tabHostPort === inputHostPort; + } catch { + return false; + } + }); + } catch { + // Invalid URL, continue with original logic + } + } + + // If still no match, abort + if (!targetTab) { + return `Cannot find URL: ${url}, page content extraction failed.`; + } + + // Attempt extraction + const currentWindowContext = + targetTab.linkedBrowser.browsingContext?.currentWindowContext; + + if (!currentWindowContext) { + return `Cannot access content from "${targetTab.label}" at ${url}.`; + // Stripped message "The tab may still be loading or is not accessible." 
to not confuse the LLM + } + + // Extract page content using PageExtractor + const pageExtractor = + await currentWindowContext.getActor("PageExtractor"); + + return this.#runExtraction( + pageExtractor, + this.DEFAULT_MODE, + `"${targetTab.label}" (${url})` + ); + } catch (error) { + // Bug 2006425 - Decide on the strategy for error handling in tool calls + // i.e., will the LLM keep retrying get_page_content due to error? + console.error(error); + return `Error retrieving content from ${url}.`; + // Stripped ${error.message} content to not confruse the LLM + } + } + + /** + * Main extraction function. + * label is of form `{tab.title} ({tab.url})`. + * + * @param {PageExtractor} pageExtractor + * @param {string} mode + * @param {string} label + * @returns {Promise<string>} + * A promise resolving to a formatted string containing the page content + * with mode and label information, or an error message if no content is available. + */ + static async #runExtraction(pageExtractor, mode, label) { + const selectedMode = + typeof mode === "string" && this.MODE_HANDLERS[mode] + ? 
mode + : this.DEFAULT_MODE; + const handler = this.MODE_HANDLERS[selectedMode]; + let extraction = null; + + try { + extraction = await handler(pageExtractor); + } catch (err) { + console.error( + "[SmartWindow] get_page_content mode failed", + selectedMode, + err + ); + } + + let pageContent = ""; + if (typeof extraction === "string") { + pageContent = extraction; + } else if (typeof extraction?.text === "string") { + pageContent = extraction.text; + } + + // Track which mode was actually used (in case we fall back) + let actualMode = selectedMode; + + // If reader mode returns no content, fall back to full mode + if (!pageContent && selectedMode === "reader") { + try { + const fallbackHandler = this.MODE_HANDLERS[this.FALLBACK_MODE]; + extraction = await fallbackHandler(pageExtractor); + if (typeof extraction === "string") { + pageContent = extraction; + } else if (typeof extraction?.text === "string") { + pageContent = extraction.text; + } + if (pageContent) { + actualMode = this.FALLBACK_MODE; + } + } catch (err) { + console.error( + "[SmartWindow] get_page_content fallback mode failed", + this.FALLBACK_MODE, + err + ); + } + } + + if (!pageContent) { + return `get_page_content(${selectedMode}) returned no content for ${label}.`; + // Stripped message "Try another mode if you still need information." to not confruse the LLM + } + + // Clean and truncate content for better LLM consumption + // Bug 2006436 - Consider doing this directly in pageExtractor if absolutely needed. + let cleanContent = pageContent + .replace(/\s+/g, " ") // Normalize whitespace + .replace(/\n\s*\n/g, "\n") // Clean up line breaks + .trim(); + + // Limit content length but be more generous for LLM processing + // Bug 1995043 - once reader mode has length truncation, + // we can remove this and directly do this in pageExtractor. 
+ if (cleanContent.length > this.MAX_CHARACTERS) { + // Try to cut at a sentence boundary + const truncatePoint = cleanContent.lastIndexOf(".", this.MAX_CHARACTERS); + if (truncatePoint > this.MAX_CHARACTERS - 100) { + cleanContent = cleanContent.substring(0, truncatePoint + 1); + } else { + cleanContent = cleanContent.substring(0, this.MAX_CHARACTERS) + "..."; + } + } + + let modeLabel; + switch (actualMode) { + case "viewport": + modeLabel = "current viewport"; + break; + case "reader": + modeLabel = "reader mode"; + break; + case "full": + modeLabel = "full page"; + break; + } + + return `Content (${modeLabel}) from ${label}: + +${cleanContent}`; + } +} diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build @@ -9,6 +9,10 @@ DIRS += [ "prompts", ] +BROWSER_CHROME_MANIFESTS += [ + "tests/browser/browser.toml", +] + MOZ_SRC_FILES += [ "Chat.sys.mjs", "ChatUtils.sys.mjs", diff --git a/browser/components/aiwindow/models/tests/browser/browser.toml b/browser/components/aiwindow/models/tests/browser/browser.toml @@ -0,0 +1,6 @@ +[DEFAULT] +support-files = [ + "head.js", +] + +["browser_get_page_content.js"] diff --git a/browser/components/aiwindow/models/tests/browser/browser_get_page_content.js b/browser/components/aiwindow/models/tests/browser/browser_get_page_content.js @@ -0,0 +1,58 @@ +/* Any copyright is dedicated to the Public Domain. + http://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +/** + * Test that the get_page_content tool call can extract content from a page. 
+ */ +add_task(async function test_get_page_content_basic() { + const html = ` + <!DOCTYPE html> + <html> + <head> + <meta charset="utf-8"> + <title>Test Page</title> + </head> + <body> + <article> + <h1>Sample Article Title</h1> + <p>This is the first paragraph with some sample content.</p> + <p>This is the second paragraph with additional information.</p> + </article> + </body> + </html> + `; + + const { url, GetPageContent, cleanup } = await setupGetPageContentTest(html); + + // Create an allowed URLs set containing the test page + const allowedUrls = new Set([url]); + + // Call the tool with the URL + const result = await GetPageContent.getPageContent({ url }, allowedUrls); + + info("Extraction result: " + result); + + // Verify the result contains expected content + ok( + result.includes("Sample Article Title"), + "Result should contain the title" + ); + ok( + result.includes("first paragraph"), + "Result should contain text from the first paragraph" + ); + ok( + result.includes("second paragraph"), + "Result should contain text from the second paragraph" + ); + + // Verify the result indicates which extraction mode was used + ok( + result.startsWith("Content (") && result.includes(") from"), + "Result should indicate the extraction mode used" + ); + + await cleanup(); +}); diff --git a/browser/components/aiwindow/models/tests/browser/head.js b/browser/components/aiwindow/models/tests/browser/head.js @@ -0,0 +1,71 @@ +/* Any copyright is dedicated to the Public Domain. + http://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +/** + * Start an HTTP server that serves HTML content. 
+ * + * @param {string} html - The HTML content to serve + * @returns {object} An object containing: + * - url: The URL where the content is served + * - serverClosed: Promise that resolves when the server stops + */ +function serveHTML(html) { + const { HttpServer } = ChromeUtils.importESModule( + "resource://testing-common/httpd.sys.mjs" + ); + + const server = new HttpServer(); + + server.registerPathHandler("/test-page.html", (_request, response) => { + response.setHeader("Content-Type", "text/html"); + response.write(html); + }); + + server.start(-1); + + const { primaryHost, primaryPort } = server.identity; + // eslint-disable-next-line @microsoft/sdl/no-insecure-url + const url = `http://${primaryHost}:${primaryPort}/test-page.html`; + + return { + url, + server, + }; +} + +/** + * Set up a test for the get_page_content tool call by serving HTML and loading it. + * + * @param {string} html - The HTML content to serve and test + * @returns {Promise<object>} An object containing: + * - tab: The opened browser tab + * - url: The URL of the loaded page + * - GetPageContent: The GetPageContent class + * - cleanup: Function to clean up the test + */ +async function setupGetPageContentTest(html) { + const { GetPageContent } = ChromeUtils.importESModule( + "moz-src:///browser/components/aiwindow/models/Tools.sys.mjs" + ); + + const { url, server } = serveHTML(html); + + const tab = await BrowserTestUtils.openNewForegroundTab( + gBrowser, + url, + true // waitForLoad + ); + + return { + tab, + url, + GetPageContent, + async cleanup() { + info("Cleaning up test"); + BrowserTestUtils.removeTab(tab); + await new Promise(resolve => server.stop(resolve)); + }, + }; +} diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js b/browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js @@ -0,0 +1,819 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const { GetPageContent } = ChromeUtils.importESModule( + "moz-src:///browser/components/aiwindow/models/Tools.sys.mjs" +); + +const { sinon } = ChromeUtils.importESModule( + "resource://testing-common/Sinon.sys.mjs" +); + +function createFakeBrowser(url, hasBrowsingContext = true) { + const parsedUrl = new URL(url); + const browser = { + currentURI: { + spec: url, + hostPort: parsedUrl.host, + }, + }; + + if (hasBrowsingContext) { + browser.browsingContext = { + currentWindowContext: { + getActor: sinon.stub().resolves({ + getText: sinon.stub().resolves("Sample page content"), + getReaderModeContent: sinon.stub().resolves(""), + }), + }, + }; + } else { + browser.browsingContext = null; + } + + return browser; +} + +function createFakeTab(url, title, hasBrowsingContext = true) { + return { + linkedBrowser: createFakeBrowser(url, hasBrowsingContext), + label: title, + }; +} + +function createFakeWindow(tabs) { + return { + closed: false, + gBrowser: { + tabs, + }, + }; +} + +function setupBrowserWindowTracker(sandbox, window) { + const BrowserWindowTracker = ChromeUtils.importESModule( + "resource:///modules/BrowserWindowTracker.sys.mjs" + ).BrowserWindowTracker; + + sandbox.stub(BrowserWindowTracker, "getTopWindow").returns(window); +} + +add_task(async function test_getPageContent_exact_url_match() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/page"; + const tabs = [ + createFakeTab("https://other.com", "Other"), + createFakeTab(targetUrl, "Example Page"), + ]; + + setupBrowserWindowTracker(sb, createFakeWindow(tabs)); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + Assert.ok(result.includes("Example Page"), "Should include page title"); + Assert.ok( + result.includes("Sample page content"), + "Should include page content" + ); + Assert.ok( + 
result.includes(targetUrl), + "Should include URL in result message" + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_normalized_url_match() { + const sb = sinon.createSandbox(); + + try { + const tabs = [ + createFakeTab("https://example.com/page/", "Example Page"), + createFakeTab("https://other.com", "Other"), + ]; + + setupBrowserWindowTracker(sb, createFakeWindow(tabs)); + + const result = await GetPageContent.getPageContent( + { url: "https://example.com/page" }, + new Set(["https://example.com/page"]) + ); + + Assert.ok( + result.includes("Example Page"), + "Should match URL after normalizing trailing slashes" + ); + Assert.ok( + result.includes("Sample page content"), + "Should include page content" + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_hostname_match() { + const sb = sinon.createSandbox(); + + try { + const tabs = [ + createFakeTab("https://example.com/page", "Example Page"), + createFakeTab("https://other.com", "Other"), + ]; + + setupBrowserWindowTracker(sb, createFakeWindow(tabs)); + + const result = await GetPageContent.getPageContent( + { url: "http://example.com/different" }, + new Set(["http://example.com/different"]) + ); + + Assert.ok( + result.includes("Example Page"), + "Should match by hostname when exact match fails" + ); + Assert.ok( + result.includes("Sample page content"), + "Should include page content" + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_tab_not_found_with_allowed_url() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://external.com/article"; + const tabs = [ + createFakeTab("https://example.com", "Example"), + createFakeTab("https://other.com", "Other"), + ]; + + setupBrowserWindowTracker(sb, createFakeWindow(tabs)); + + const allowedUrls = new Set([targetUrl]); + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + allowedUrls + ); + + // 
Headless extraction doesn't work in xpcshell environment + // In real usage, this would attempt headless extraction for allowed URLs + Assert.ok( + result.includes("Cannot find URL"), + "Should return error when tab not found (headless doesn't work in xpcshell)" + ); + Assert.ok(result.includes(targetUrl), "Should include target URL in error"); + } finally { + sb.restore(); + } +}); + +add_task( + async function test_getPageContent_tab_not_found_without_allowed_url() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://notfound.com/page"; + const tabs = [ + createFakeTab("https://example.com", "Example"), + createFakeTab("https://other.com", "Other"), + createFakeTab("https://third.com", "Third"), + createFakeTab("https://fourth.com", "Fourth"), + ]; + + setupBrowserWindowTracker(sb, createFakeWindow(tabs)); + + const allowedUrls = new Set(["https://different.com"]); + + // When URL is not in allowedUrls, it attempts headless extraction + // This doesn't work in xpcshell, so we expect an error + let errorThrown = false; + try { + await GetPageContent.getPageContent({ url: targetUrl }, allowedUrls); + } catch (error) { + errorThrown = true; + Assert.ok( + error.message.includes("addProgressListener"), + "Should fail with headless browser error in xpcshell" + ); + } + + Assert.ok( + errorThrown, + "Should throw error when attempting headless extraction in xpcshell" + ); + } finally { + sb.restore(); + } + } +); + +add_task(async function test_getPageContent_no_browsing_context() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/loading"; + const tabs = [createFakeTab(targetUrl, "Loading Page", false)]; + + setupBrowserWindowTracker(sb, createFakeWindow(tabs)); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + Assert.ok( + result.includes("Cannot access content"), + "Should return error for unavailable browsing context" + ); + Assert.ok( + 
result.includes("Loading Page"), + "Should include tab label in error" + ); + Assert.ok( + result.includes(targetUrl), + "Should include URL in error message" + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_successful_extraction() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/article"; + const pageContent = "This is a well-written article with lots of content."; + + const mockExtractor = { + getText: sinon.stub().resolves(pageContent), + getReaderModeContent: sinon.stub().resolves(""), + }; + + const tab = createFakeTab(targetUrl, "Article"); + tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon + .stub() + .resolves(mockExtractor); + + setupBrowserWindowTracker(sb, createFakeWindow([tab])); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + Assert.ok(result.includes("Content (full page)"), "Should indicate mode"); + Assert.ok(result.includes("Article"), "Should include tab title"); + Assert.ok(result.includes(targetUrl), "Should include URL"); + Assert.ok(result.includes(pageContent), "Should include extracted content"); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_content_truncation() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/long"; + const longContent = "A".repeat(15000); + + const mockExtractor = { + getText: sinon.stub().resolves(longContent), + getReaderModeContent: sinon.stub().resolves(""), + }; + + const tab = createFakeTab(targetUrl, "Long Page"); + tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon + .stub() + .resolves(mockExtractor); + + setupBrowserWindowTracker(sb, createFakeWindow([tab])); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + const contentMatch = result.match(/Content \(full page\) from.*:\s*(.*)/s); + 
Assert.ok(contentMatch, "Should match content pattern"); + + const extractedContent = contentMatch[1].trim(); + Assert.lessOrEqual( + extractedContent.length, + 10003, + "Content should be truncated to ~10000 chars (with ...)" + ); + Assert.ok( + extractedContent.endsWith("..."), + "Truncated content should end with ..." + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_truncation_at_sentence_boundary() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/sentences"; + const sentence = "This is a sentence. "; + const longContent = sentence.repeat(600); + + const mockExtractor = { + getText: sinon.stub().resolves(longContent), + getReaderModeContent: sinon.stub().resolves(""), + }; + + const tab = createFakeTab(targetUrl, "Sentences"); + tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon + .stub() + .resolves(mockExtractor); + + setupBrowserWindowTracker(sb, createFakeWindow([tab])); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + const contentMatch = result.match(/Content \(full page\) from.*:\s*(.*)/s); + Assert.ok(contentMatch, "Should match content pattern"); + + const extractedContent = contentMatch[1].trim(); + Assert.lessOrEqual( + extractedContent.length, + 10001, + "Should truncate near 10000 chars" + ); + Assert.ok( + extractedContent.endsWith("."), + "Should end at sentence boundary (period)" + ); + Assert.ok( + !extractedContent.endsWith("..."), + "Should not have ... 
when truncated at sentence" + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_empty_content() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/empty"; + + const mockExtractor = { + getText: sinon.stub().resolves(" \n \n "), + getReaderModeContent: sinon.stub().resolves(""), + }; + + const tab = createFakeTab(targetUrl, "Empty Page"); + tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon + .stub() + .resolves(mockExtractor); + + setupBrowserWindowTracker(sb, createFakeWindow([tab])); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + // Whitespace content is normalized but still returns success + Assert.ok( + result.includes("Content (full page)"), + "Should use full page mode after reader fallback" + ); + Assert.ok(result.includes("Empty Page"), "Should include tab label"); + // The content is essentially empty after normalization, but still returned + Assert.ok( + result.match(/:\s*$/), + "Content should be mostly empty after normalization" + ); + } finally { + sb.restore(); + } +}); + +add_task(async function test_getPageContent_extraction_error() { + const sb = sinon.createSandbox(); + + try { + const targetUrl = "https://example.com/error"; + + const mockExtractor = { + getText: sinon.stub().rejects(new Error("Extraction failed")), + getReaderModeContent: sinon.stub().resolves(""), + }; + + const tab = createFakeTab(targetUrl, "Error Page"); + tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon + .stub() + .resolves(mockExtractor); + + setupBrowserWindowTracker(sb, createFakeWindow([tab])); + + const result = await GetPageContent.getPageContent( + { url: targetUrl }, + new Set([targetUrl]) + ); + + Assert.ok( + result.includes("returned no content"), + "Should handle extraction error gracefully" + ); + Assert.ok(result.includes("Error Page"), "Should include tab label"); + } finally { 
+    sb.restore();
+  }
+});
+
+// No mode argument is passed and the reader-mode stub resolves to "", so the
+// result is expected to carry the "Content (full page)" label and getText's text.
+// NOTE(review): the task name says "viewport" but only the full-page default is
+// asserted - confirm the intended name/coverage.
+add_task(async function test_getPageContent_viewport_mode() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com/viewport";
+
+    const mockExtractor = {
+      getText: sinon.stub().resolves("Full page content"),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const tab = createFakeTab(targetUrl, "Viewport Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+      .stub()
+      .resolves(mockExtractor);
+
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Content (full page)"),
+      "Should use full mode by default"
+    );
+    Assert.ok(result.includes("Full page content"), "Should include content");
+  } finally {
+    sb.restore();
+  }
+});
+
+// The reader-mode stub resolves to a non-empty string; the result is expected to
+// carry the "Content (reader mode)" label together with that string.
+add_task(async function test_getPageContent_reader_mode_string() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com/reader";
+    const readerContent = "Clean reader mode text";
+
+    const mockExtractor = {
+      getText: sinon.stub().resolves("Full content"),
+      getReaderModeContent: sinon.stub().resolves(readerContent),
+    };
+
+    const tab = createFakeTab(targetUrl, "Reader Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+      .stub()
+      .resolves(mockExtractor);
+
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Content (reader mode)"),
+      "Should use reader mode by default"
+    );
+    Assert.ok(
+      result.includes(readerContent),
+      "Should include reader mode content"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// null is handed to setupBrowserWindowTracker in place of a window; the call is
+// expected to resolve to an error string.
+add_task(async function test_getPageContent_no_window() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com";
+    setupBrowserWindowTracker(sb, null);
+
+    // Add URL to allowed list so it checks for window instead of trying headless
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Error retrieving content"),
+      "Should handle null window gracefully"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// The window stub reports closed: true and has no tabs; either of the two error
+// messages is accepted.
+add_task(async function test_getPageContent_closed_window() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com";
+    const closedWindow = {
+      closed: true,
+      gBrowser: { tabs: [] },
+    };
+
+    setupBrowserWindowTracker(sb, closedWindow);
+
+    // Add URL to allowed list so it checks for window instead of trying headless
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Error retrieving content") ||
+        result.includes("Cannot find URL"),
+      "Should handle closed window with error"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// The window stub is open but its gBrowser is null.
+add_task(async function test_getPageContent_window_without_gBrowser() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com";
+    const windowWithoutGBrowser = {
+      closed: false,
+      gBrowser: null,
+    };
+
+    setupBrowserWindowTracker(sb, windowWithoutGBrowser);
+
+    // Add URL to allowed list so it checks for window instead of trying headless
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Error retrieving content"),
+      "Should handle window without gBrowser"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// Runs of spaces and newlines in the extracted text are expected to come back
+// as single spaces.
+add_task(async function test_getPageContent_whitespace_normalization() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com/whitespace";
+    // Deliberately contains runs of 2-4 spaces as well as runs of newlines.
+    const messyContent =
+      "Text  with   lots\n\n\nof    whitespace\n\n\n\nhere";
+
+    const mockExtractor = {
+      getText: sinon.stub().resolves(messyContent),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const tab = createFakeTab(targetUrl, "Whitespace Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+      .stub()
+      .resolves(mockExtractor);
+
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Text with lots of whitespace here"),
+      "Should normalize whitespace"
+    );
+    // The needle is TWO spaces: a single-space needle would contradict the
+    // assertion above, whose expected text itself contains single spaces.
+    Assert.ok(
+      !result.includes("  "),
+      "Should not have multiple consecutive spaces"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// The requested url string is not URL-shaped and matches no open tab.
+add_task(async function test_getPageContent_invalid_url_format() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "not-a-valid-url";
+    const tabs = [createFakeTab("https://example.com", "Example")];
+
+    setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+    // Add URL to allowed list so it searches tabs instead of trying headless
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should handle invalid URL format"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// getText resolves to a plain string, which is expected to appear in the result.
+add_task(async function test_getPageContent_extraction_returns_string() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com/string";
+    const directString = "Direct string content";
+
+    const mockExtractor = {
+      getText: sinon.stub().resolves(directString),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const tab = createFakeTab(targetUrl, "String Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+      .stub()
+      .resolves(mockExtractor);
+
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes(directString),
+      "Should handle extraction returning string directly"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// getText resolves to an object instead of a string.
+add_task(async function test_getPageContent_extraction_returns_object() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com/object";
+    // The API now expects strings, not objects
+    // If getText returns a non-string object, it should be treated as no content
+    const objectContent = { text: "Object text content" };
+
+    const mockExtractor = {
+      getText: sinon.stub().resolves(objectContent),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const tab = createFakeTab(targetUrl, "Object Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+      .stub()
+      .resolves(mockExtractor);
+
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    // API expects strings now, objects are treated as no content
+    Assert.ok(
+      result.includes("returned no content"),
+      "Should treat object return value as no content"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// getText resolves to a number instead of a string.
+add_task(
+  async function test_getPageContent_extraction_returns_non_string_text() {
+    const sb = sinon.createSandbox();
+
+    try {
+      const targetUrl = "https://example.com/nonstring";
+
+      const mockExtractor = {
+        getText: sinon.stub().resolves(12345),
+        getReaderModeContent: sinon.stub().resolves(""),
+      };
+
+      const tab = createFakeTab(targetUrl, "Non-string Test");
+      tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+        .stub()
+        .resolves(mockExtractor);
+
+      setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+      const result = await GetPageContent.getPageContent(
+        { url: targetUrl },
+        new Set([targetUrl])
+      );
+
+      Assert.ok(
+        result.includes("returned no content"),
+        "Should handle non-string text property as empty"
+      );
+    } finally {
+      sb.restore();
+    }
+  }
+);
+
+// The requested URL is in the allowed set but is not among the open tabs.
+add_task(async function test_getPageContent_allowed_urls_set() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://allowed.com/page";
+    const tabs = [createFakeTab("https://other.com", "Other")];
+
+    setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+    const allowedUrls = new Set([
+      "https://allowed.com/page",
+      "https://another-allowed.com",
+    ]);
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      allowedUrls
+    );
+
+    // Headless extraction doesn't work in xpcshell environment
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should return error when tab not found (headless doesn't work in xpcshell)"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// Four unrelated tabs are open and the requested URL matches none of them.
+// NOTE(review): despite the task name, only the error text and the requested URL
+// are asserted, not a listing of the open tabs - confirm intended coverage.
+add_task(async function test_getPageContent_available_tabs_list() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://notfound.com";
+    const tabs = [
+      createFakeTab("https://first.com", "First Tab"),
+      createFakeTab("https://second.com", "Second Tab"),
+      createFakeTab("https://third.com", "Third Tab"),
+      createFakeTab("https://fourth.com", "Fourth Tab"),
+    ];
+
+    setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+    // Add the URL to allowed list so it searches tabs instead of trying headless
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    // URL is in allowed list but not open, so should get error
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should return error when tab not found"
+    );
+    Assert.ok(
+      result.includes(targetUrl),
+      "Should include requested URL in error"
+    );
+  } finally {
+    sb.restore();
+  }
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml @@ -30,6 +30,8 @@ support-files = [] ["test_Tools_GetOpenTabs.js"] +["test_Tools_GetPageContent.js"] + ["test_Tools_SearchBrowsingHistory.js"] ["test_Utils.js"]