commit 738780f1f83bcfe282d30051b08c666d5b778353
parent 5ea2407af277878756eef5539935066c443bf35c
Author: Tom Zhang <tzhang@mozilla.com>
Date: Thu, 18 Dec 2025 17:11:19 +0000
Bug 2004843 - Implement get_page_content tool r=gregtatum,ai-models-reviewers,tburrell
Differential Revision: https://phabricator.services.mozilla.com/D276302
Diffstat:
7 files changed, 1228 insertions(+), 16 deletions(-)
diff --git a/browser/components/aiwindow/models/Tools.sys.mjs b/browser/components/aiwindow/models/Tools.sys.mjs
@@ -5,10 +5,11 @@
*/
/**
- * This file contains LLM tool abscrations and tool definitions.
+ * This file contains LLM tool abstractions and tool definitions.
*/
import { searchBrowsingHistory as implSearchBrowsingHistory } from "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs";
+import { PageExtractorParent } from "resource://gre/actors/PageExtractorParent.sys.mjs";
const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
@@ -19,18 +20,19 @@ ChromeUtils.defineESModuleGetters(lazy, {
const GET_OPEN_TABS = "get_open_tabs";
const SEARCH_BROWSING_HISTORY = "search_browsing_history";
+const GET_PAGE_CONTENT = "get_page_content";
-// eslint-disable-next-line no-unused-vars
-const TOOLS = [GET_OPEN_TABS, SEARCH_BROWSING_HISTORY];
+export const TOOLS = [GET_OPEN_TABS, SEARCH_BROWSING_HISTORY, GET_PAGE_CONTENT];
-// eslint-disable-next-line no-unused-vars
-const toolsConfig = [
+export const toolsConfig = [
{
type: "function",
function: {
name: GET_OPEN_TABS,
description:
- "Access the user's browser and return a list of most recently browsed tabs. Each tab is represented by a JSON with the page's url, title and description if available. Default to return maximum 15 tabs.",
+ "Access the user's browser and return a list of most recently browsed tabs. " +
+ "Each tab is represented by a JSON with the page's url, title and description " +
+ "if available. Default to return maximum 15 tabs.",
parameters: {
type: "object",
properties: {},
@@ -42,25 +44,39 @@ const toolsConfig = [
function: {
name: SEARCH_BROWSING_HISTORY,
description:
- 'Refind pages from the user\'s PAST BROWSING HISTORY. Use this whenever the user wants to recall, review, list, or see pages they visited earlier (for a topic, site, or time period). Also use this when the user requests all pages from a past time period (e.g., "yesterday", "last week"), even if no topic is specified. Do NOT use for open tabs, completely general web questions, or abstract questions about "history" or habits.',
+ "Refind pages from the user's PAST BROWSING HISTORY. Use this whenever the " +
+ "user wants to recall, review, list, or see pages they visited earlier (for a " +
+ "topic, site, or time period). Also use this when the user requests all pages " +
+ 'from a past time period (e.g., "yesterday", "last week"), even if no topic is ' +
+ "specified. Do NOT use for open tabs, completely general web questions, or " +
+ 'abstract questions about "history" or habits.',
parameters: {
type: "object",
properties: {
searchTerm: {
type: "string",
description:
- "A detailed, noun-heavy phrase (~2-12 meaningful tokens) summarizing the user's intent for semantic retrieval. Include the main entity/topic plus 1-3 contextual qualifiers (e.g., library name, purpose, site, or timeframe). Avoid vague or single-word queries.",
+ "A detailed, noun-heavy phrase (~2-12 meaningful tokens) summarizing " +
+ "the user's intent for semantic retrieval. Include the main entity/topic " +
+ "plus 1-3 contextual qualifiers (e.g., library name, purpose, site, or " +
+ "timeframe). Avoid vague or single-word queries.",
},
startTs: {
type: "string",
description:
- "Inclusive lower bound of the time window as an ISO 8601 datetime string (e.g., '2025-11-07T09:00:00-05:00'). Use when the user asks for results within a time or range start, such as 'last week', 'since yesterday', or 'last night'. This must be before the user's current datetime.",
+ "Inclusive lower bound of the time window as an ISO 8601 datetime string " +
+ "(e.g., '2025-11-07T09:00:00-05:00'). Use when the user asks for results " +
+ "within a time or range start, such as 'last week', 'since yesterday', or " +
+ "'last night'. This must be before the user's current datetime.",
default: null,
},
endTs: {
type: "string",
description:
- "Inclusive upper bound of the time window as an ISO 8601 datetime string (e.g., '2025-11-07T21:00:00-05:00'). Use when the user asks for results within a time or range end, such as 'last week', 'between 2025-10-01 and 2025-10-31', or 'before Monday'. This must be before the user's current datetime.",
+ "Inclusive upper bound of the time window as an ISO 8601 datetime string " +
+ "(e.g., '2025-11-07T21:00:00-05:00'). Use when the user asks for results " +
+ "within a time or range end, such as 'last week', 'between 2025-10-01 and " +
+ "2025-10-31', or 'before Monday'. This must be before the user's current datetime.",
default: null,
},
},
@@ -68,6 +84,26 @@ const toolsConfig = [
},
},
},
+ {
+ type: "function",
+ function: {
+ name: GET_PAGE_CONTENT,
+ description:
+ "Retrieve cleaned text content of the provided browser page URL.",
+ parameters: {
+ properties: {
+ url: {
+ type: "string",
+ description:
+ "The complete URL of the page to fetch content from. This must exactly match " +
+ "a URL from the current conversation context. Use the full URL including " +
+ "protocol (http/https). Example: 'https://www.example.com/article'.",
+ },
+ },
+ required: ["url"],
+ },
+ },
+ },
];
/**
@@ -126,7 +162,7 @@ export async function getOpenTabs(n = 15) {
}
/**
- * Tool entrypoint for browsing history search.
+ * Tool entrypoint for search_browsing_history.
*
* Parameters (defaults shown):
* - searchTerm: "" - string used for search
@@ -136,16 +172,16 @@ export async function getOpenTabs(n = 15) {
*
* Detailed behavior and implementation are in SearchBrowsingHistory.sys.mjs.
*
- * @param {object} params
+ * @param {object} toolParams
* The search parameters.
- * @param {string} params.searchTerm
+ * @param {string} toolParams.searchTerm
* The search string. If null or empty, semantic search is skipped and
* results are filtered by time range and sorted by last_visit_date and frecency.
- * @param {string|null} params.startTs
+ * @param {string|null} toolParams.startTs
* Optional ISO-8601 start timestamp (e.g. "2025-11-07T09:00:00-05:00").
- * @param {string|null} params.endTs
+ * @param {string|null} toolParams.endTs
* Optional ISO-8601 end timestamp (e.g. "2025-11-07T09:00:00-05:00").
- * @param {number} params.historyLimit
+ * @param {number} toolParams.historyLimit
* Maximum number of history results to return.
* @returns {Promise<object>}
* A promise resolving to an object with the search term and history results.
@@ -200,3 +236,219 @@ export function stripSearchBrowsingHistoryFields(result) {
return result;
}
}
+
+/**
+ * Class for handling page content extraction with configurable modes and limits.
+ */
+export class GetPageContent {
+ static DEFAULT_MODE = "reader";
+ static FALLBACK_MODE = "full";
+ static MAX_CHARACTERS = 10000;
+
+ static MODE_HANDLERS = {
+ viewport: async pageExtractor => {
+ const result = await pageExtractor.getText({ justViewport: true });
+ return { text: result.text };
+ },
+ reader: async pageExtractor => {
+ const text = await pageExtractor.getReaderModeContent();
+ return { text: typeof text === "string" ? text : "" };
+ },
+ full: async pageExtractor => {
+ const result = await pageExtractor.getText();
+ return { text: result };
+ },
+ };
+
+ /**
+ * Tool entrypoint for get_page_content.
+ *
+ * @param {object} toolParams
+ * @param {string} toolParams.url
+ * @param {Set<string>} allowedUrls
+ * @returns {Promise<string>}
+ * A promise resolving to a string containing the extracted page content
+ * with a descriptive header, or an error message if extraction fails.
+ */
+ static async getPageContent({ url }, allowedUrls) {
+ try {
+ // Search through the allowed URLs and extract directly if exists
+ if (!allowedUrls.has(url)) {
+ // Bug 2006418 - This will load the page headlessly, and then extract the content.
+ // It might be a better idea to have the lifetime of the page be tied to the chat
+ // while it's open, and with a "keep alive" timeout. For now it's simpler to just
+ // load the page fresh every time.
+ return PageExtractorParent.getHeadlessExtractor(url, pageExtractor =>
+ this.#runExtraction(pageExtractor, this.DEFAULT_MODE, url)
+ );
+ }
+
+ // TODO: figure out what windows we can access to give permission here, and update this API
+ let win = lazy.BrowserWindowTracker.getTopWindow();
+ let gBrowser = win.gBrowser;
+ let tabs = gBrowser.tabs;
+
+ // Find the tab with the matching URL in browser
+ let targetTab = null;
+ for (let i = 0; i < tabs.length; i++) {
+ const tab = tabs[i];
+ const currentURI = tab?.linkedBrowser?.currentURI;
+ if (currentURI?.spec === url) {
+ targetTab = tab;
+ break;
+ }
+ }
+
+ // If no match, try hostname matching for cases where protocols differ
+ if (!targetTab) {
+ try {
+ const inputHostPort = new URL(url).host;
+ targetTab = tabs.find(tab => {
+ try {
+ const tabHostPort = tab.linkedBrowser.currentURI.hostPort;
+ return tabHostPort === inputHostPort;
+ } catch {
+ return false;
+ }
+ });
+ } catch {
+ // Invalid URL, continue with original logic
+ }
+ }
+
+ // If still no match, abort
+ if (!targetTab) {
+ return `Cannot find URL: ${url}, page content extraction failed.`;
+ }
+
+ // Attempt extraction
+ const currentWindowContext =
+ targetTab.linkedBrowser.browsingContext?.currentWindowContext;
+
+ if (!currentWindowContext) {
+ return `Cannot access content from "${targetTab.label}" at ${url}.`;
+ // Stripped message "The tab may still be loading or is not accessible." to not confuse the LLM
+ }
+
+ // Extract page content using PageExtractor
+ const pageExtractor =
+ await currentWindowContext.getActor("PageExtractor");
+
+ return this.#runExtraction(
+ pageExtractor,
+ this.DEFAULT_MODE,
+ `"${targetTab.label}" (${url})`
+ );
+ } catch (error) {
+ // Bug 2006425 - Decide on the strategy for error handling in tool calls
+ // i.e., will the LLM keep retrying get_page_content due to error?
+ console.error(error);
+ return `Error retrieving content from ${url}.`;
+ // Stripped ${error.message} content to not confuse the LLM
+ }
+ }
+
+ /**
+ * Main extraction function.
+ * label is of form `{tab.title} ({tab.url})`.
+ *
+ * @param {PageExtractor} pageExtractor
+ * @param {string} mode
+ * @param {string} label
+ * @returns {Promise<string>}
+ * A promise resolving to a formatted string containing the page content
+ * with mode and label information, or an error message if no content is available.
+ */
+ static async #runExtraction(pageExtractor, mode, label) {
+ const selectedMode =
+ typeof mode === "string" && this.MODE_HANDLERS[mode]
+ ? mode
+ : this.DEFAULT_MODE;
+ const handler = this.MODE_HANDLERS[selectedMode];
+ let extraction = null;
+
+ try {
+ extraction = await handler(pageExtractor);
+ } catch (err) {
+ console.error(
+ "[SmartWindow] get_page_content mode failed",
+ selectedMode,
+ err
+ );
+ }
+
+ let pageContent = "";
+ if (typeof extraction === "string") {
+ pageContent = extraction;
+ } else if (typeof extraction?.text === "string") {
+ pageContent = extraction.text;
+ }
+
+ // Track which mode was actually used (in case we fall back)
+ let actualMode = selectedMode;
+
+ // If reader mode returns no content, fall back to full mode
+ if (!pageContent && selectedMode === "reader") {
+ try {
+ const fallbackHandler = this.MODE_HANDLERS[this.FALLBACK_MODE];
+ extraction = await fallbackHandler(pageExtractor);
+ if (typeof extraction === "string") {
+ pageContent = extraction;
+ } else if (typeof extraction?.text === "string") {
+ pageContent = extraction.text;
+ }
+ if (pageContent) {
+ actualMode = this.FALLBACK_MODE;
+ }
+ } catch (err) {
+ console.error(
+ "[SmartWindow] get_page_content fallback mode failed",
+ this.FALLBACK_MODE,
+ err
+ );
+ }
+ }
+
+ if (!pageContent) {
+ return `get_page_content(${selectedMode}) returned no content for ${label}.`;
+ // Stripped message "Try another mode if you still need information." to not confuse the LLM
+ }
+
+ // Clean and truncate content for better LLM consumption
+ // Bug 2006436 - Consider doing this directly in pageExtractor if absolutely needed.
+ let cleanContent = pageContent
+ .replace(/\s+/g, " ") // Normalize whitespace
+ .replace(/\n\s*\n/g, "\n") // Clean up line breaks
+ .trim();
+
+ // Limit content length but be more generous for LLM processing
+ // Bug 1995043 - once reader mode has length truncation,
+ // we can remove this and directly do this in pageExtractor.
+ if (cleanContent.length > this.MAX_CHARACTERS) {
+ // Try to cut at a sentence boundary
+ const truncatePoint = cleanContent.lastIndexOf(".", this.MAX_CHARACTERS);
+ if (truncatePoint > this.MAX_CHARACTERS - 100) {
+ cleanContent = cleanContent.substring(0, truncatePoint + 1);
+ } else {
+ cleanContent = cleanContent.substring(0, this.MAX_CHARACTERS) + "...";
+ }
+ }
+
+ let modeLabel;
+ switch (actualMode) {
+ case "viewport":
+ modeLabel = "current viewport";
+ break;
+ case "reader":
+ modeLabel = "reader mode";
+ break;
+ case "full":
+ modeLabel = "full page";
+ break;
+ }
+
+ return `Content (${modeLabel}) from ${label}:
+
+${cleanContent}`;
+ }
+}
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -9,6 +9,10 @@ DIRS += [
"prompts",
]
+BROWSER_CHROME_MANIFESTS += [
+ "tests/browser/browser.toml",
+]
+
MOZ_SRC_FILES += [
"Chat.sys.mjs",
"ChatUtils.sys.mjs",
diff --git a/browser/components/aiwindow/models/tests/browser/browser.toml b/browser/components/aiwindow/models/tests/browser/browser.toml
@@ -0,0 +1,6 @@
+[DEFAULT]
+support-files = [
+ "head.js",
+]
+
+["browser_get_page_content.js"]
diff --git a/browser/components/aiwindow/models/tests/browser/browser_get_page_content.js b/browser/components/aiwindow/models/tests/browser/browser_get_page_content.js
@@ -0,0 +1,58 @@
+/* Any copyright is dedicated to the Public Domain.
+ http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * Test that the get_page_content tool call can extract content from a page.
+ */
+add_task(async function test_get_page_content_basic() {
+ const html = `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8">
+ <title>Test Page</title>
+ </head>
+ <body>
+ <article>
+ <h1>Sample Article Title</h1>
+ <p>This is the first paragraph with some sample content.</p>
+ <p>This is the second paragraph with additional information.</p>
+ </article>
+ </body>
+ </html>
+ `;
+
+ const { url, GetPageContent, cleanup } = await setupGetPageContentTest(html);
+
+ // Create an allowed URLs set containing the test page
+ const allowedUrls = new Set([url]);
+
+ // Call the tool with the URL
+ const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+ info("Extraction result: " + result);
+
+ // Verify the result contains expected content
+ ok(
+ result.includes("Sample Article Title"),
+ "Result should contain the title"
+ );
+ ok(
+ result.includes("first paragraph"),
+ "Result should contain text from the first paragraph"
+ );
+ ok(
+ result.includes("second paragraph"),
+ "Result should contain text from the second paragraph"
+ );
+
+ // Verify the result indicates which extraction mode was used
+ ok(
+ result.startsWith("Content (") && result.includes(") from"),
+ "Result should indicate the extraction mode used"
+ );
+
+ await cleanup();
+});
diff --git a/browser/components/aiwindow/models/tests/browser/head.js b/browser/components/aiwindow/models/tests/browser/head.js
@@ -0,0 +1,71 @@
+/* Any copyright is dedicated to the Public Domain.
+ http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * Start an HTTP server that serves HTML content.
+ *
+ * @param {string} html - The HTML content to serve
+ * @returns {object} An object containing:
+ * - url: The URL where the content is served
+ * - serverClosed: Promise that resolves when the server stops
+ */
+function serveHTML(html) {
+ const { HttpServer } = ChromeUtils.importESModule(
+ "resource://testing-common/httpd.sys.mjs"
+ );
+
+ const server = new HttpServer();
+
+ server.registerPathHandler("/test-page.html", (_request, response) => {
+ response.setHeader("Content-Type", "text/html");
+ response.write(html);
+ });
+
+ server.start(-1);
+
+ const { primaryHost, primaryPort } = server.identity;
+ // eslint-disable-next-line @microsoft/sdl/no-insecure-url
+ const url = `http://${primaryHost}:${primaryPort}/test-page.html`;
+
+ return {
+ url,
+ server,
+ };
+}
+
+/**
+ * Set up a test for the get_page_content tool call by serving HTML and loading it.
+ *
+ * @param {string} html - The HTML content to serve and test
+ * @returns {Promise<object>} An object containing:
+ * - tab: The opened browser tab
+ * - url: The URL of the loaded page
+ * - GetPageContent: The GetPageContent class
+ * - cleanup: Function to clean up the test
+ */
+async function setupGetPageContentTest(html) {
+ const { GetPageContent } = ChromeUtils.importESModule(
+ "moz-src:///browser/components/aiwindow/models/Tools.sys.mjs"
+ );
+
+ const { url, server } = serveHTML(html);
+
+ const tab = await BrowserTestUtils.openNewForegroundTab(
+ gBrowser,
+ url,
+ true // waitForLoad
+ );
+
+ return {
+ tab,
+ url,
+ GetPageContent,
+ async cleanup() {
+ info("Cleaning up test");
+ BrowserTestUtils.removeTab(tab);
+ await new Promise(resolve => server.stop(resolve));
+ },
+ };
+}
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js b/browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js
@@ -0,0 +1,819 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const { GetPageContent } = ChromeUtils.importESModule(
+ "moz-src:///browser/components/aiwindow/models/Tools.sys.mjs"
+);
+
+const { sinon } = ChromeUtils.importESModule(
+ "resource://testing-common/Sinon.sys.mjs"
+);
+
+function createFakeBrowser(url, hasBrowsingContext = true) {
+ const parsedUrl = new URL(url);
+ const browser = {
+ currentURI: {
+ spec: url,
+ hostPort: parsedUrl.host,
+ },
+ };
+
+ if (hasBrowsingContext) {
+ browser.browsingContext = {
+ currentWindowContext: {
+ getActor: sinon.stub().resolves({
+ getText: sinon.stub().resolves("Sample page content"),
+ getReaderModeContent: sinon.stub().resolves(""),
+ }),
+ },
+ };
+ } else {
+ browser.browsingContext = null;
+ }
+
+ return browser;
+}
+
+function createFakeTab(url, title, hasBrowsingContext = true) {
+ return {
+ linkedBrowser: createFakeBrowser(url, hasBrowsingContext),
+ label: title,
+ };
+}
+
+function createFakeWindow(tabs) {
+ return {
+ closed: false,
+ gBrowser: {
+ tabs,
+ },
+ };
+}
+
+function setupBrowserWindowTracker(sandbox, window) {
+ const BrowserWindowTracker = ChromeUtils.importESModule(
+ "resource:///modules/BrowserWindowTracker.sys.mjs"
+ ).BrowserWindowTracker;
+
+ sandbox.stub(BrowserWindowTracker, "getTopWindow").returns(window);
+}
+
+add_task(async function test_getPageContent_exact_url_match() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/page";
+ const tabs = [
+ createFakeTab("https://other.com", "Other"),
+ createFakeTab(targetUrl, "Example Page"),
+ ];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(result.includes("Example Page"), "Should include page title");
+ Assert.ok(
+ result.includes("Sample page content"),
+ "Should include page content"
+ );
+ Assert.ok(
+ result.includes(targetUrl),
+ "Should include URL in result message"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_normalized_url_match() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const tabs = [
+ createFakeTab("https://example.com/page/", "Example Page"),
+ createFakeTab("https://other.com", "Other"),
+ ];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const result = await GetPageContent.getPageContent(
+ { url: "https://example.com/page" },
+ new Set(["https://example.com/page"])
+ );
+
+ Assert.ok(
+ result.includes("Example Page"),
+ "Should match URL after normalizing trailing slashes"
+ );
+ Assert.ok(
+ result.includes("Sample page content"),
+ "Should include page content"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_hostname_match() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const tabs = [
+ createFakeTab("https://example.com/page", "Example Page"),
+ createFakeTab("https://other.com", "Other"),
+ ];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const result = await GetPageContent.getPageContent(
+ { url: "http://example.com/different" },
+ new Set(["http://example.com/different"])
+ );
+
+ Assert.ok(
+ result.includes("Example Page"),
+ "Should match by hostname when exact match fails"
+ );
+ Assert.ok(
+ result.includes("Sample page content"),
+ "Should include page content"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_tab_not_found_with_allowed_url() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://external.com/article";
+ const tabs = [
+ createFakeTab("https://example.com", "Example"),
+ createFakeTab("https://other.com", "Other"),
+ ];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const allowedUrls = new Set([targetUrl]);
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ allowedUrls
+ );
+
+ // Headless extraction doesn't work in xpcshell environment
+ // In real usage, this would attempt headless extraction for allowed URLs
+ Assert.ok(
+ result.includes("Cannot find URL"),
+ "Should return error when tab not found (headless doesn't work in xpcshell)"
+ );
+ Assert.ok(result.includes(targetUrl), "Should include target URL in error");
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(
+ async function test_getPageContent_tab_not_found_without_allowed_url() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://notfound.com/page";
+ const tabs = [
+ createFakeTab("https://example.com", "Example"),
+ createFakeTab("https://other.com", "Other"),
+ createFakeTab("https://third.com", "Third"),
+ createFakeTab("https://fourth.com", "Fourth"),
+ ];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const allowedUrls = new Set(["https://different.com"]);
+
+ // When URL is not in allowedUrls, it attempts headless extraction
+ // This doesn't work in xpcshell, so we expect an error
+ let errorThrown = false;
+ try {
+ await GetPageContent.getPageContent({ url: targetUrl }, allowedUrls);
+ } catch (error) {
+ errorThrown = true;
+ Assert.ok(
+ error.message.includes("addProgressListener"),
+ "Should fail with headless browser error in xpcshell"
+ );
+ }
+
+ Assert.ok(
+ errorThrown,
+ "Should throw error when attempting headless extraction in xpcshell"
+ );
+ } finally {
+ sb.restore();
+ }
+ }
+);
+
+add_task(async function test_getPageContent_no_browsing_context() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/loading";
+ const tabs = [createFakeTab(targetUrl, "Loading Page", false)];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Cannot access content"),
+ "Should return error for unavailable browsing context"
+ );
+ Assert.ok(
+ result.includes("Loading Page"),
+ "Should include tab label in error"
+ );
+ Assert.ok(
+ result.includes(targetUrl),
+ "Should include URL in error message"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_successful_extraction() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/article";
+ const pageContent = "This is a well-written article with lots of content.";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(pageContent),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Article");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(result.includes("Content (full page)"), "Should indicate mode");
+ Assert.ok(result.includes("Article"), "Should include tab title");
+ Assert.ok(result.includes(targetUrl), "Should include URL");
+ Assert.ok(result.includes(pageContent), "Should include extracted content");
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_content_truncation() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/long";
+ const longContent = "A".repeat(15000);
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(longContent),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Long Page");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ const contentMatch = result.match(/Content \(full page\) from.*:\s*(.*)/s);
+ Assert.ok(contentMatch, "Should match content pattern");
+
+ const extractedContent = contentMatch[1].trim();
+ Assert.lessOrEqual(
+ extractedContent.length,
+ 10003,
+ "Content should be truncated to ~10000 chars (with ...)"
+ );
+ Assert.ok(
+ extractedContent.endsWith("..."),
+ "Truncated content should end with ..."
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_truncation_at_sentence_boundary() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/sentences";
+ const sentence = "This is a sentence. ";
+ const longContent = sentence.repeat(600);
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(longContent),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Sentences");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ const contentMatch = result.match(/Content \(full page\) from.*:\s*(.*)/s);
+ Assert.ok(contentMatch, "Should match content pattern");
+
+ const extractedContent = contentMatch[1].trim();
+ Assert.lessOrEqual(
+ extractedContent.length,
+ 10001,
+ "Should truncate near 10000 chars"
+ );
+ Assert.ok(
+ extractedContent.endsWith("."),
+ "Should end at sentence boundary (period)"
+ );
+ Assert.ok(
+ !extractedContent.endsWith("..."),
+ "Should not have ... when truncated at sentence"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_empty_content() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/empty";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(" \n \n "),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Empty Page");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ // Whitespace content is normalized but still returns success
+ Assert.ok(
+ result.includes("Content (full page)"),
+ "Should use full page mode after reader fallback"
+ );
+ Assert.ok(result.includes("Empty Page"), "Should include tab label");
+ // The content is essentially empty after normalization, but still returned
+ Assert.ok(
+ result.match(/:\s*$/),
+ "Content should be mostly empty after normalization"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_extraction_error() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/error";
+
+ const mockExtractor = {
+ getText: sinon.stub().rejects(new Error("Extraction failed")),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Error Page");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("returned no content"),
+ "Should handle extraction error gracefully"
+ );
+ Assert.ok(result.includes("Error Page"), "Should include tab label");
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_viewport_mode() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/viewport";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves("Full page content"),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Viewport Test");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Content (full page)"),
+ "Should use full mode by default"
+ );
+ Assert.ok(result.includes("Full page content"), "Should include content");
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_reader_mode_string() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/reader";
+ const readerContent = "Clean reader mode text";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves("Full content"),
+ getReaderModeContent: sinon.stub().resolves(readerContent),
+ };
+
+ const tab = createFakeTab(targetUrl, "Reader Test");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Content (reader mode)"),
+ "Should use reader mode by default"
+ );
+ Assert.ok(
+ result.includes(readerContent),
+ "Should include reader mode content"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_no_window() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com";
+ setupBrowserWindowTracker(sb, null);
+
+ // Add URL to allowed list so it checks for window instead of trying headless
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Error retrieving content"),
+ "Should handle null window gracefully"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_closed_window() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com";
+ const closedWindow = {
+ closed: true,
+ gBrowser: { tabs: [] },
+ };
+
+ setupBrowserWindowTracker(sb, closedWindow);
+
+ // Add URL to allowed list so it checks for window instead of trying headless
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Error retrieving content") ||
+ result.includes("Cannot find URL"),
+ "Should handle closed window with error"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_window_without_gBrowser() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com";
+ const windowWithoutGBrowser = {
+ closed: false,
+ gBrowser: null,
+ };
+
+ setupBrowserWindowTracker(sb, windowWithoutGBrowser);
+
+ // Add URL to allowed list so it checks for window instead of trying headless
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Error retrieving content"),
+ "Should handle window without gBrowser"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_whitespace_normalization() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/whitespace";
+ const messyContent =
+ "Text with lots\n\n\nof whitespace\n\n\n\nhere";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(messyContent),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Whitespace Test");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Text with lots of whitespace here"),
+ "Should normalize whitespace"
+ );
+ Assert.ok(
+ !result.includes(" "),
+ "Should not have multiple consecutive spaces"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_invalid_url_format() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "not-a-valid-url";
+ const tabs = [createFakeTab("https://example.com", "Example")];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ // Add URL to allowed list so it searches tabs instead of trying headless
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("Cannot find URL"),
+ "Should handle invalid URL format"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_extraction_returns_string() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/string";
+ const directString = "Direct string content";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(directString),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "String Test");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes(directString),
+ "Should handle extraction returning string directly"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_extraction_returns_object() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/object";
+ // The API now expects strings, not objects
+ // If getText returns a non-string object, it should be treated as no content
+ const objectContent = { text: "Object text content" };
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(objectContent),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Object Test");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ // API expects strings now, objects are treated as no content
+ Assert.ok(
+ result.includes("returned no content"),
+ "Should treat object return value as no content"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(
+ async function test_getPageContent_extraction_returns_non_string_text() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://example.com/nonstring";
+
+ const mockExtractor = {
+ getText: sinon.stub().resolves(12345),
+ getReaderModeContent: sinon.stub().resolves(""),
+ };
+
+ const tab = createFakeTab(targetUrl, "Non-string Test");
+ tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+ .stub()
+ .resolves(mockExtractor);
+
+ setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ Assert.ok(
+ result.includes("returned no content"),
+ "Should handle non-string text property as empty"
+ );
+ } finally {
+ sb.restore();
+ }
+ }
+);
+
+add_task(async function test_getPageContent_allowed_urls_set() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://allowed.com/page";
+ const tabs = [createFakeTab("https://other.com", "Other")];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ const allowedUrls = new Set([
+ "https://allowed.com/page",
+ "https://another-allowed.com",
+ ]);
+
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ allowedUrls
+ );
+
+ // Headless extraction doesn't work in xpcshell environment
+ Assert.ok(
+ result.includes("Cannot find URL"),
+ "Should return error when tab not found (headless doesn't work in xpcshell)"
+ );
+ } finally {
+ sb.restore();
+ }
+});
+
+add_task(async function test_getPageContent_available_tabs_list() {
+ const sb = sinon.createSandbox();
+
+ try {
+ const targetUrl = "https://notfound.com";
+ const tabs = [
+ createFakeTab("https://first.com", "First Tab"),
+ createFakeTab("https://second.com", "Second Tab"),
+ createFakeTab("https://third.com", "Third Tab"),
+ createFakeTab("https://fourth.com", "Fourth Tab"),
+ ];
+
+ setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+ // Add the URL to allowed list so it searches tabs instead of trying headless
+ const result = await GetPageContent.getPageContent(
+ { url: targetUrl },
+ new Set([targetUrl])
+ );
+
+ // URL is in allowed list but not open, so should get error
+ Assert.ok(
+ result.includes("Cannot find URL"),
+ "Should return error when tab not found"
+ );
+ Assert.ok(
+ result.includes(targetUrl),
+ "Should include requested URL in error"
+ );
+ } finally {
+ sb.restore();
+ }
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -30,6 +30,8 @@ support-files = []
["test_Tools_GetOpenTabs.js"]
+["test_Tools_GetPageContent.js"]
+
["test_Tools_SearchBrowsingHistory.js"]
["test_Utils.js"]