[ tor-browser ].git.dasho

commit 738780f1f83bcfe282d30051b08c666d5b778353
parent 5ea2407af277878756eef5539935066c443bf35c
Author: Tom Zhang <tzhang@mozilla.com>
Date:   Thu, 18 Dec 2025 17:11:19 +0000

Bug 2004843 - Implement get_page_content tool r=gregtatum,ai-models-reviewers,tburrell

Differential Revision: https://phabricator.services.mozilla.com/D276302

Diffstat:
M browser/components/aiwindow/models/Tools.sys.mjs  | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M browser/components/aiwindow/models/moz.build  | 4 ++++
A browser/components/aiwindow/models/tests/browser/browser.toml  | 6 ++++++
A browser/components/aiwindow/models/tests/browser/browser_get_page_content.js  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/browser/head.js  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js  | 819 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml  | 2 ++

7 files changed, 1228 insertions(+), 16 deletions(-)
diff --git a/browser/components/aiwindow/models/Tools.sys.mjs b/browser/components/aiwindow/models/Tools.sys.mjs
@@ -5,10 +5,11 @@
  */
 
 /**
- * This file contains LLM tool abscrations and tool definitions.
+ * This file contains LLM tool abstractions and tool definitions.
  */
 
 import { searchBrowsingHistory as implSearchBrowsingHistory } from "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs";
+import { PageExtractorParent } from "resource://gre/actors/PageExtractorParent.sys.mjs";
 
 const lazy = {};
 ChromeUtils.defineESModuleGetters(lazy, {
@@ -19,18 +20,19 @@ ChromeUtils.defineESModuleGetters(lazy, {
 
 const GET_OPEN_TABS = "get_open_tabs";
 const SEARCH_BROWSING_HISTORY = "search_browsing_history";
+const GET_PAGE_CONTENT = "get_page_content";
 
-// eslint-disable-next-line no-unused-vars
-const TOOLS = [GET_OPEN_TABS, SEARCH_BROWSING_HISTORY];
+export const TOOLS = [GET_OPEN_TABS, SEARCH_BROWSING_HISTORY, GET_PAGE_CONTENT];
 
-// eslint-disable-next-line no-unused-vars
-const toolsConfig = [
+export const toolsConfig = [
   {
     type: "function",
     function: {
       name: GET_OPEN_TABS,
       description:
-        "Access the user's browser and return a list of most recently browsed tabs. Each tab is represented by a JSON with the page's url, title and description if available. Default to return maximum 15 tabs.",
+        "Access the user's browser and return a list of most recently browsed tabs. " +
+        "Each tab is represented by a JSON with the page's url, title and description " +
+        "if available. Default to return maximum 15 tabs.",
       parameters: {
         type: "object",
         properties: {},
@@ -42,25 +44,39 @@ const toolsConfig = [
     function: {
       name: SEARCH_BROWSING_HISTORY,
       description:
-        'Refind pages from the user\'s PAST BROWSING HISTORY. Use this whenever the user wants to recall, review, list, or see pages they visited earlier (for a topic, site, or time period). Also use this when the user requests all pages from a past time period (e.g., "yesterday", "last week"), even if no topic is specified. Do NOT use for open tabs, completely general web questions, or abstract questions about "history" or habits.',
+        "Refind pages from the user's PAST BROWSING HISTORY. Use this whenever the " +
+        "user wants to recall, review, list, or see pages they visited earlier (for a " +
+        "topic, site, or time period). Also use this when the user requests all pages " +
+        'from a past time period (e.g., "yesterday", "last week"), even if no topic is ' +
+        "specified. Do NOT use for open tabs, completely general web questions, or " +
+        'abstract questions about "history" or habits.',
       parameters: {
         type: "object",
         properties: {
           searchTerm: {
             type: "string",
             description:
-              "A detailed, noun-heavy phrase (~2-12 meaningful tokens) summarizing the user's intent for semantic retrieval. Include the main entity/topic plus 1-3 contextual qualifiers (e.g., library name, purpose, site, or timeframe). Avoid vague or single-word queries.",
+              "A detailed, noun-heavy phrase (~2-12 meaningful tokens) summarizing " +
+              "the user's intent for semantic retrieval. Include the main entity/topic " +
+              "plus 1-3 contextual qualifiers (e.g., library name, purpose, site, or " +
+              "timeframe). Avoid vague or single-word queries.",
           },
           startTs: {
             type: "string",
             description:
-              "Inclusive lower bound of the time window as an ISO 8601 datetime string (e.g., '2025-11-07T09:00:00-05:00'). Use when the user asks for results within a time or range start, such as 'last week', 'since yesterday', or 'last night'. This must be before the user's current datetime.",
+              // Every fragment but the last ends with a space so the joined
+              // sentence keeps its word breaks.
+              "Inclusive lower bound of the time window as an ISO 8601 datetime string " +
+              "(e.g., '2025-11-07T09:00:00-05:00'). Use when the user asks for results " +
+              "within a time or range start, such as 'last week', 'since yesterday', or " +
+              "'last night'. This must be before the user's current datetime.",
             default: null,
           },
           endTs: {
             type: "string",
             description:
-              "Inclusive upper bound of the time window as an ISO 8601 datetime string (e.g., '2025-11-07T21:00:00-05:00'). Use when the user asks for results within a time or range end, such as 'last week', 'between 2025-10-01 and 2025-10-31', or 'before Monday'. This must be before the user's current datetime.",
+              "Inclusive upper bound of the time window as an ISO 8601 datetime string " +
+              "(e.g., '2025-11-07T21:00:00-05:00'). Use when the user asks for results " +
+              "within a time or range end, such as 'last week', 'between 2025-10-01 and " +
+              "2025-10-31', or 'before Monday'. This must be before the user's current datetime.",
             default: null,
           },
         },
@@ -68,6 +84,26 @@ const toolsConfig = [
       },
     },
   },
+  {
+    type: "function",
+    function: {
+      name: GET_PAGE_CONTENT,
+      description:
+        "Retrieve cleaned text content of the provided browser page URL.",
+      parameters: {
+        // Declare the schema root type explicitly, as the other tool entries
+        // in this array do.
+        type: "object",
+        properties: {
+          url: {
+            type: "string",
+            description:
+              "The complete URL of the page to fetch content from. This must exactly match " +
+              "a URL from the current conversation context. Use the full URL including " +
+              "protocol (http/https). Example: 'https://www.example.com/article'.",
+          },
+        },
+        required: ["url"],
+      },
+    },
+  },
 ];
 
 /**
@@ -126,7 +162,7 @@ export async function getOpenTabs(n = 15) {
 }
 
 /**
- * Tool entrypoint for browsing history search.
+ * Tool entrypoint for search_browsing_history.
  *
  * Parameters (defaults shown):
  * - searchTerm: ""        - string used for search
@@ -136,16 +172,16 @@ export async function getOpenTabs(n = 15) {
  *
  * Detailed behavior and implementation are in SearchBrowsingHistory.sys.mjs.
  *
- * @param {object} params
+ * @param {object} toolParams
  *  The search parameters.
- * @param {string} params.searchTerm
+ * @param {string} toolParams.searchTerm
  *  The search string. If null or empty, semantic search is skipped and
  *  results are filtered by time range and sorted by last_visit_date and frecency.
- * @param {string|null} params.startTs
+ * @param {string|null} toolParams.startTs
  *  Optional ISO-8601 start timestamp (e.g. "2025-11-07T09:00:00-05:00").
- * @param {string|null} params.endTs
+ * @param {string|null} toolParams.endTs
  *  Optional ISO-8601 end timestamp (e.g. "2025-11-07T09:00:00-05:00").
- * @param {number} params.historyLimit
+ * @param {number} toolParams.historyLimit
  *  Maximum number of history results to return.
  * @returns {Promise<object>}
  *  A promise resolving to an object with the search term and history results.
@@ -200,3 +236,219 @@ export function stripSearchBrowsingHistoryFields(result) {
     return result;
   }
 }
+
+/**
+ * Extracts the text content of a page for the get_page_content tool.
+ *
+ * Extraction runs in one of the modes listed in MODE_HANDLERS. The default is
+ * reader mode, falling back to full-page text when reader mode yields nothing.
+ * The extracted text is whitespace-normalized and capped at MAX_CHARACTERS.
+ */
+export class GetPageContent {
+  static DEFAULT_MODE = "reader";
+  static FALLBACK_MODE = "full";
+  static MAX_CHARACTERS = 10000;
+
+  /**
+   * Extraction strategies keyed by mode name. Each handler receives a page
+   * extractor and resolves to `{ text }` where `text` is a string (empty when
+   * the extractor produced nothing usable).
+   */
+  static MODE_HANDLERS = {
+    viewport: async pageExtractor => {
+      const result = await pageExtractor.getText({ justViewport: true });
+      return { text: GetPageContent.#toText(result) };
+    },
+    reader: async pageExtractor => {
+      const text = await pageExtractor.getReaderModeContent();
+      return { text: typeof text === "string" ? text : "" };
+    },
+    full: async pageExtractor => {
+      const result = await pageExtractor.getText();
+      return { text: GetPageContent.#toText(result) };
+    },
+  };
+
+  /** Human-readable mode names used in the header of the returned string. */
+  static #MODE_LABELS = {
+    viewport: "current viewport",
+    reader: "reader mode",
+    full: "full page",
+  };
+
+  /**
+   * Tool entrypoint for get_page_content.
+   *
+   * @param {object} toolParams
+   * @param {string} toolParams.url
+   *  The URL of the page to extract.
+   * @param {Set<string>} allowedUrls
+   *  URLs that are looked up among the tabs of the top browser window. Any
+   *  other URL is loaded headlessly instead.
+   * @returns {Promise<string>}
+   *  A promise resolving to a string containing the extracted page content
+   *  with a descriptive header, or an error message if extraction fails.
+   *  On the headless path the extractor's promise is returned as-is, so a
+   *  rejection there reaches the caller instead of becoming an error message.
+   */
+  static async getPageContent({ url }, allowedUrls) {
+    try {
+      // URLs outside the allowed set are not searched for among open tabs.
+      if (!allowedUrls.has(url)) {
+        //  Bug 2006418  - This will load the page headlessly, and then extract the content.
+        // It might be a better idea to have the lifetime of the page be tied to the chat
+        // while it's open, and with a "keep alive" timeout. For now it's simpler to just
+        // load the page fresh every time.
+        //
+        // The promise is returned without `await`, so the catch block below
+        // does not handle its rejection.
+        return PageExtractorParent.getHeadlessExtractor(url, pageExtractor =>
+          this.#runExtraction(pageExtractor, this.DEFAULT_MODE, url)
+        );
+      }
+
+      // TODO: figure out what windows we can access to give permission here, and update this API
+      const win = lazy.BrowserWindowTracker.getTopWindow();
+      const tabs = win.gBrowser.tabs;
+
+      // Find the tab with the matching URL in browser
+      let targetTab =
+        tabs.find(tab => tab?.linkedBrowser?.currentURI?.spec === url) ?? null;
+
+      // If no match, try host matching for cases where protocols differ.
+      // This accepts any tab on the same host and port, whatever its path.
+      if (!targetTab) {
+        try {
+          const inputHostPort = new URL(url).host;
+          targetTab = tabs.find(tab => {
+            try {
+              return tab.linkedBrowser.currentURI.hostPort === inputHostPort;
+            } catch {
+              return false;
+            }
+          });
+        } catch {
+          // Invalid URL, continue with original logic
+        }
+      }
+
+      // If still no match, abort
+      if (!targetTab) {
+        return `Cannot find URL: ${url}, page content extraction failed.`;
+      }
+
+      // Attempt extraction
+      const currentWindowContext =
+        targetTab.linkedBrowser.browsingContext?.currentWindowContext;
+
+      if (!currentWindowContext) {
+        // Stripped message "The tab may still be loading or is not accessible." to not confuse the LLM
+        return `Cannot access content from "${targetTab.label}" at ${url}.`;
+      }
+
+      // Extract page content using PageExtractor
+      const pageExtractor =
+        await currentWindowContext.getActor("PageExtractor");
+
+      // The label carries the requested URL, not the URL of the matched tab.
+      return this.#runExtraction(
+        pageExtractor,
+        this.DEFAULT_MODE,
+        `"${targetTab.label}" (${url})`
+      );
+    } catch (error) {
+      // Bug 2006425 - Decide on the strategy for error handling in tool calls
+      // i.e., will the LLM keep retrying get_page_content due to error?
+      console.error(error);
+      // Stripped ${error.message} content to not confuse the LLM
+      return `Error retrieving content from ${url}.`;
+    }
+  }
+
+  /**
+   * Reduce an extractor result to a plain string.
+   *
+   * Accepts either a string or an object with a string `text` property, so
+   * both result shapes are treated alike by every handler.
+   *
+   * @param {unknown} value
+   * @returns {string} The text, or "" when none is available.
+   */
+  static #toText(value) {
+    if (typeof value === "string") {
+      return value;
+    }
+    if (typeof value?.text === "string") {
+      return value.text;
+    }
+    return "";
+  }
+
+  /**
+   * Run one mode handler and return its text. A failing handler is logged
+   * and treated as "no content" rather than aborting the extraction.
+   *
+   * @param {PageExtractor} pageExtractor
+   * @param {string} mode - A key of MODE_HANDLERS.
+   * @param {string} failureMessage - Log prefix used if the handler throws.
+   * @returns {Promise<string>}
+   */
+  static async #extractWithMode(pageExtractor, mode, failureMessage) {
+    try {
+      const extraction = await this.MODE_HANDLERS[mode](pageExtractor);
+      return this.#toText(extraction);
+    } catch (err) {
+      console.error(failureMessage, mode, err);
+      return "";
+    }
+  }
+
+  /**
+   * Main extraction function.
+   * label is of form `{tab.title} ({tab.url})`.
+   *
+   * @param {PageExtractor} pageExtractor
+   * @param {string} mode
+   *  Requested mode; anything that is not a key of MODE_HANDLERS is replaced
+   *  by DEFAULT_MODE.
+   * @param {string} label
+   * @returns {Promise<string>}
+   *  A promise resolving to a formatted string containing the page content
+   *  with mode and label information, or an error message if no content is available.
+   */
+  static async #runExtraction(pageExtractor, mode, label) {
+    // Own-property check so inherited keys such as "constructor" are not
+    // mistaken for a mode.
+    const selectedMode =
+      typeof mode === "string" && Object.hasOwn(this.MODE_HANDLERS, mode)
+        ? mode
+        : this.DEFAULT_MODE;
+
+    let pageContent = await this.#extractWithMode(
+      pageExtractor,
+      selectedMode,
+      "[SmartWindow] get_page_content mode failed"
+    );
+
+    // Track which mode was actually used (in case we fall back)
+    let actualMode = selectedMode;
+
+    // If reader mode returns no content, fall back to full mode
+    if (!pageContent && selectedMode === "reader") {
+      pageContent = await this.#extractWithMode(
+        pageExtractor,
+        this.FALLBACK_MODE,
+        "[SmartWindow] get_page_content fallback mode failed"
+      );
+      if (pageContent) {
+        actualMode = this.FALLBACK_MODE;
+      }
+    }
+
+    if (!pageContent) {
+      // Stripped message "Try another mode if you still need information." to not confuse the LLM
+      return `get_page_content(${selectedMode}) returned no content for ${label}.`;
+    }
+
+    // Clean content for better LLM consumption: every whitespace run,
+    // line breaks included, collapses to a single space. Content made only of
+    // whitespace therefore ends up empty here and is still returned with its
+    // header.
+    //  Bug 2006436 - Consider doing this directly in pageExtractor if absolutely needed.
+    let cleanContent = pageContent.replace(/\s+/g, " ").trim();
+
+    // Limit content length but be more generous for LLM processing
+    // Bug 1995043 - once reader mode has length truncation,
+    // we can remove this and directly do this in pageExtractor.
+    if (cleanContent.length > this.MAX_CHARACTERS) {
+      // Try to cut at a sentence boundary, but only if one lies within the
+      // last 100 characters before the limit.
+      const truncatePoint = cleanContent.lastIndexOf(".", this.MAX_CHARACTERS);
+      if (truncatePoint > this.MAX_CHARACTERS - 100) {
+        cleanContent = cleanContent.substring(0, truncatePoint + 1);
+      } else {
+        cleanContent = cleanContent.substring(0, this.MAX_CHARACTERS) + "...";
+      }
+    }
+
+    const modeLabel = this.#MODE_LABELS[actualMode];
+
+    return `Content (${modeLabel}) from ${label}:\n\n${cleanContent}`;
+  }
+}
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -9,6 +9,10 @@ DIRS += [
     "prompts",
 ]
 
+BROWSER_CHROME_MANIFESTS += [
+    "tests/browser/browser.toml",
+]
+
 MOZ_SRC_FILES += [
     "Chat.sys.mjs",
     "ChatUtils.sys.mjs",
diff --git a/browser/components/aiwindow/models/tests/browser/browser.toml b/browser/components/aiwindow/models/tests/browser/browser.toml
@@ -0,0 +1,6 @@
+[DEFAULT]
+support-files = [
+  "head.js",
+]
+
+["browser_get_page_content.js"]
diff --git a/browser/components/aiwindow/models/tests/browser/browser_get_page_content.js b/browser/components/aiwindow/models/tests/browser/browser_get_page_content.js
@@ -0,0 +1,58 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * Test that the get_page_content tool call can extract content from a page.
+ *
+ * The page is served from a local HTTP server and opened in a foreground tab;
+ * its URL is placed in the allowed set so the tool reads it from that tab.
+ */
+add_task(async function test_get_page_content_basic() {
+  const html = `
+    <!DOCTYPE html>
+    <html>
+    <head>
+      <meta charset="utf-8">
+      <title>Test Page</title>
+    </head>
+    <body>
+      <article>
+        <h1>Sample Article Title</h1>
+        <p>This is the first paragraph with some sample content.</p>
+        <p>This is the second paragraph with additional information.</p>
+      </article>
+    </body>
+    </html>
+  `;
+
+  const { url, GetPageContent, cleanup } = await setupGetPageContentTest(html);
+
+  // Run cleanup even if the tool call rejects, so the tab and the HTTP server
+  // do not outlive this task.
+  try {
+    // Create an allowed URLs set containing the test page
+    const allowedUrls = new Set([url]);
+
+    // Call the tool with the URL
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    info("Extraction result: " + result);
+
+    // Verify the result contains expected content
+    ok(
+      result.includes("Sample Article Title"),
+      "Result should contain the title"
+    );
+    ok(
+      result.includes("first paragraph"),
+      "Result should contain text from the first paragraph"
+    );
+    ok(
+      result.includes("second paragraph"),
+      "Result should contain text from the second paragraph"
+    );
+
+    // Verify the result indicates which extraction mode was used
+    ok(
+      result.startsWith("Content (") && result.includes(") from"),
+      "Result should indicate the extraction mode used"
+    );
+  } finally {
+    await cleanup();
+  }
+});
diff --git a/browser/components/aiwindow/models/tests/browser/head.js b/browser/components/aiwindow/models/tests/browser/head.js
@@ -0,0 +1,71 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * Start an HTTP server that serves HTML content at /test-page.html.
+ *
+ * The caller is responsible for stopping the returned server.
+ *
+ * @param {string} html - The HTML content to serve
+ * @returns {object} An object containing:
+ *   - url: The URL where the content is served
+ *   - server: The started HttpServer instance
+ */
+function serveHTML(html) {
+  const { HttpServer } = ChromeUtils.importESModule(
+    "resource://testing-common/httpd.sys.mjs"
+  );
+
+  const server = new HttpServer();
+
+  server.registerPathHandler("/test-page.html", (_request, response) => {
+    response.setHeader("Content-Type", "text/html");
+    response.write(html);
+  });
+
+  server.start(-1);
+
+  const { primaryHost, primaryPort } = server.identity;
+  // eslint-disable-next-line @microsoft/sdl/no-insecure-url
+  const url = `http://${primaryHost}:${primaryPort}/test-page.html`;
+
+  return {
+    url,
+    server,
+  };
+}
+
+/**
+ * Set up a test for the get_page_content tool call by serving HTML and loading it.
+ *
+ * If the tab cannot be opened, the HTTP server is stopped before the error is
+ * rethrown, so a failed setup does not leave a server running.
+ *
+ * @param {string} html - The HTML content to serve and test
+ * @returns {Promise<object>} An object containing:
+ *   - tab: The opened browser tab
+ *   - url: The URL of the loaded page
+ *   - GetPageContent: The GetPageContent class
+ *   - cleanup: Async function that removes the tab and stops the server
+ */
+async function setupGetPageContentTest(html) {
+  const { GetPageContent } = ChromeUtils.importESModule(
+    "moz-src:///browser/components/aiwindow/models/Tools.sys.mjs"
+  );
+
+  const { url, server } = serveHTML(html);
+  const stopServer = () => new Promise(resolve => server.stop(resolve));
+
+  let tab;
+  try {
+    tab = await BrowserTestUtils.openNewForegroundTab(
+      gBrowser,
+      url,
+      true // waitForLoad
+    );
+  } catch (error) {
+    // No cleanup function has been handed out yet, so release the server here.
+    await stopServer();
+    throw error;
+  }
+
+  return {
+    tab,
+    url,
+    GetPageContent,
+    async cleanup() {
+      info("Cleaning up test");
+      BrowserTestUtils.removeTab(tab);
+      await stopServer();
+    },
+  };
+}
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js b/browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js
@@ -0,0 +1,819 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const { GetPageContent } = ChromeUtils.importESModule(
+  "moz-src:///browser/components/aiwindow/models/Tools.sys.mjs"
+);
+
+const { sinon } = ChromeUtils.importESModule(
+  "resource://testing-common/Sinon.sys.mjs"
+);
+
+/**
+ * Build a minimal stand-in for a tab's linked browser.
+ *
+ * @param {string} url - Reported as `currentURI.spec`; its host (with port)
+ *   is reported as `currentURI.hostPort`.
+ * @param {boolean} [hasBrowsingContext=true] - When false, `browsingContext`
+ *   is null. Otherwise its `getActor` stub resolves to an extractor whose
+ *   reader-mode content is empty and whose text is "Sample page content".
+ * @returns {object} The fake browser.
+ */
+function createFakeBrowser(url, hasBrowsingContext = true) {
+  const { host } = new URL(url);
+
+  let browsingContext = null;
+  if (hasBrowsingContext) {
+    const extractorActor = {
+      getText: sinon.stub().resolves("Sample page content"),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+    browsingContext = {
+      currentWindowContext: {
+        getActor: sinon.stub().resolves(extractorActor),
+      },
+    };
+  }
+
+  return {
+    currentURI: { spec: url, hostPort: host },
+    browsingContext,
+  };
+}
+
+/**
+ * Build a fake tab whose label is `title` and whose linked browser comes from
+ * createFakeBrowser(url, hasBrowsingContext).
+ */
+function createFakeTab(url, title, hasBrowsingContext = true) {
+  const linkedBrowser = createFakeBrowser(url, hasBrowsingContext);
+  return { linkedBrowser, label: title };
+}
+
+/**
+ * Build a fake, non-closed browser window exposing `tabs` through `gBrowser.tabs`.
+ */
+function createFakeWindow(tabs) {
+  const gBrowser = { tabs };
+  return { closed: false, gBrowser };
+}
+
+/**
+ * Stub BrowserWindowTracker.getTopWindow, within the given sinon sandbox, so
+ * that it returns `window`.
+ */
+function setupBrowserWindowTracker(sandbox, window) {
+  const { BrowserWindowTracker } = ChromeUtils.importESModule(
+    "resource:///modules/BrowserWindowTracker.sys.mjs"
+  );
+
+  sandbox.stub(BrowserWindowTracker, "getTopWindow").returns(window);
+}
+
+// A tab whose URL equals the requested URL exactly is used for extraction.
+add_task(async function test_getPageContent_exact_url_match() {
+  const sandbox = sinon.createSandbox();
+  const targetUrl = "https://example.com/page";
+
+  try {
+    // The matching tab is second, so the lookup has to pass a non-matching tab.
+    const fakeWindow = createFakeWindow([
+      createFakeTab("https://other.com", "Other"),
+      createFakeTab(targetUrl, "Example Page"),
+    ]);
+    setupBrowserWindowTracker(sandbox, fakeWindow);
+
+    const output = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(output.includes("Example Page"), "Should include page title");
+    Assert.ok(
+      output.includes("Sample page content"),
+      "Should include page content"
+    );
+    Assert.ok(
+      output.includes(targetUrl),
+      "Should include URL in result message"
+    );
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// The tab URL has a trailing slash and the requested URL does not, so the
+// exact comparison fails. GetPageContent does not normalize URLs; the tab is
+// found by the host:port fallback.
+add_task(async function test_getPageContent_normalized_url_match() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const tabs = [
+      createFakeTab("https://example.com/page/", "Example Page"),
+      createFakeTab("https://other.com", "Other"),
+    ];
+
+    setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+    const result = await GetPageContent.getPageContent(
+      { url: "https://example.com/page" },
+      new Set(["https://example.com/page"])
+    );
+
+    Assert.ok(
+      result.includes("Example Page"),
+      "Should find the tab when URLs differ only by a trailing slash"
+    );
+    Assert.ok(
+      result.includes("Sample page content"),
+      "Should include page content"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// When no tab has the exact URL, a tab on the same host is used instead.
+add_task(async function test_getPageContent_hostname_match() {
+  const sandbox = sinon.createSandbox();
+  // Same host as the first tab, but a different scheme and path.
+  const requestedUrl = "http://example.com/different";
+
+  try {
+    const fakeWindow = createFakeWindow([
+      createFakeTab("https://example.com/page", "Example Page"),
+      createFakeTab("https://other.com", "Other"),
+    ]);
+    setupBrowserWindowTracker(sandbox, fakeWindow);
+
+    const output = await GetPageContent.getPageContent(
+      { url: requestedUrl },
+      new Set([requestedUrl])
+    );
+
+    Assert.ok(
+      output.includes("Example Page"),
+      "Should match by hostname when exact match fails"
+    );
+    Assert.ok(
+      output.includes("Sample page content"),
+      "Should include page content"
+    );
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// An allowed URL is only looked up among open tabs. When no tab matches by
+// exact URL or by host, the tool reports that it cannot find the URL.
+add_task(async function test_getPageContent_tab_not_found_with_allowed_url() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://external.com/article";
+    const tabs = [
+      createFakeTab("https://example.com", "Example"),
+      createFakeTab("https://other.com", "Other"),
+    ];
+
+    setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+    const allowedUrls = new Set([targetUrl]);
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      allowedUrls
+    );
+
+    // Headless extraction is not attempted here: that path is taken only for
+    // URLs that are absent from allowedUrls.
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should return error when no open tab matches an allowed URL"
+    );
+    Assert.ok(result.includes(targetUrl), "Should include target URL in error");
+  } finally {
+    sb.restore();
+  }
+});
+
+// A URL that is not in allowedUrls goes down the headless extraction path.
+// That path does not work in xpcshell, and its rejection is not converted
+// into an error string, so the call itself rejects.
+add_task(
+  async function test_getPageContent_tab_not_found_without_allowed_url() {
+    const sb = sinon.createSandbox();
+
+    try {
+      const targetUrl = "https://notfound.com/page";
+      const tabs = [
+        createFakeTab("https://example.com", "Example"),
+        createFakeTab("https://other.com", "Other"),
+        createFakeTab("https://third.com", "Third"),
+        createFakeTab("https://fourth.com", "Fourth"),
+      ];
+
+      setupBrowserWindowTracker(sb, createFakeWindow(tabs));
+
+      const allowedUrls = new Set(["https://different.com"]);
+
+      // Assert.rejects fails the test both when the promise resolves and when
+      // the rejection does not match the pattern.
+      await Assert.rejects(
+        GetPageContent.getPageContent({ url: targetUrl }, allowedUrls),
+        /addProgressListener/,
+        "Should fail with headless browser error in xpcshell"
+      );
+    } finally {
+      sb.restore();
+    }
+  }
+);
+
+// A matching tab without a browsing context yields a "Cannot access content"
+// message naming the tab and the URL.
+add_task(async function test_getPageContent_no_browsing_context() {
+  const sandbox = sinon.createSandbox();
+  const targetUrl = "https://example.com/loading";
+
+  try {
+    const loadingTab = createFakeTab(targetUrl, "Loading Page", false);
+    setupBrowserWindowTracker(sandbox, createFakeWindow([loadingTab]));
+
+    const output = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      output.includes("Cannot access content"),
+      "Should return error for unavailable browsing context"
+    );
+    Assert.ok(
+      output.includes("Loading Page"),
+      "Should include tab label in error"
+    );
+    Assert.ok(
+      output.includes(targetUrl),
+      "Should include URL in error message"
+    );
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// Reader mode returns nothing, so the full-page text is used and the header
+// reports "full page" along with the tab title and URL.
+add_task(async function test_getPageContent_successful_extraction() {
+  const sandbox = sinon.createSandbox();
+  const targetUrl = "https://example.com/article";
+  const pageContent = "This is a well-written article with lots of content.";
+
+  try {
+    const extractorStub = {
+      getText: sinon.stub().resolves(pageContent),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const articleTab = createFakeTab(targetUrl, "Article");
+    const { currentWindowContext } = articleTab.linkedBrowser.browsingContext;
+    currentWindowContext.getActor = sinon.stub().resolves(extractorStub);
+
+    setupBrowserWindowTracker(sandbox, createFakeWindow([articleTab]));
+
+    const output = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(output.includes("Content (full page)"), "Should indicate mode");
+    Assert.ok(output.includes("Article"), "Should include tab title");
+    Assert.ok(output.includes(targetUrl), "Should include URL");
+    Assert.ok(output.includes(pageContent), "Should include extracted content");
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// Content without any period is cut at the character limit and gets "..."
+// appended.
+add_task(async function test_getPageContent_content_truncation() {
+  const sandbox = sinon.createSandbox();
+  const targetUrl = "https://example.com/long";
+
+  try {
+    const extractorStub = {
+      getText: sinon.stub().resolves("A".repeat(15000)),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const longTab = createFakeTab(targetUrl, "Long Page");
+    const { currentWindowContext } = longTab.linkedBrowser.browsingContext;
+    currentWindowContext.getActor = sinon.stub().resolves(extractorStub);
+
+    setupBrowserWindowTracker(sandbox, createFakeWindow([longTab]));
+
+    const output = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    const headerAndBody = output.match(/Content \(full page\) from.*:\s*(.*)/s);
+    Assert.ok(headerAndBody, "Should match content pattern");
+
+    const body = headerAndBody[1].trim();
+    Assert.lessOrEqual(
+      body.length,
+      10003,
+      "Content should be truncated to ~10000 chars (with ...)"
+    );
+    Assert.ok(body.endsWith("..."), "Truncated content should end with ...");
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// Content made of short sentences is cut at the last period before the
+// character limit, without an appended "...".
+add_task(async function test_getPageContent_truncation_at_sentence_boundary() {
+  const sandbox = sinon.createSandbox();
+  const targetUrl = "https://example.com/sentences";
+
+  try {
+    const repeatedSentences = "This is a sentence. ".repeat(600);
+    const extractorStub = {
+      getText: sinon.stub().resolves(repeatedSentences),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const sentencesTab = createFakeTab(targetUrl, "Sentences");
+    const { currentWindowContext } = sentencesTab.linkedBrowser.browsingContext;
+    currentWindowContext.getActor = sinon.stub().resolves(extractorStub);
+
+    setupBrowserWindowTracker(sandbox, createFakeWindow([sentencesTab]));
+
+    const output = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    const headerAndBody = output.match(/Content \(full page\) from.*:\s*(.*)/s);
+    Assert.ok(headerAndBody, "Should match content pattern");
+
+    const body = headerAndBody[1].trim();
+    Assert.lessOrEqual(body.length, 10001, "Should truncate near 10000 chars");
+    Assert.ok(
+      body.endsWith("."),
+      "Should end at sentence boundary (period)"
+    );
+    Assert.ok(
+      !body.endsWith("..."),
+      "Should not have ... when truncated at sentence"
+    );
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// Whitespace-only page text still counts as content: the result carries the
+// "full page" header followed by an empty body.
+add_task(async function test_getPageContent_empty_content() {
+  const sandbox = sinon.createSandbox();
+  const targetUrl = "https://example.com/empty";
+
+  try {
+    const extractorStub = {
+      getText: sinon.stub().resolves("   \n  \n   "),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const emptyTab = createFakeTab(targetUrl, "Empty Page");
+    const { currentWindowContext } = emptyTab.linkedBrowser.browsingContext;
+    currentWindowContext.getActor = sinon.stub().resolves(extractorStub);
+
+    setupBrowserWindowTracker(sandbox, createFakeWindow([emptyTab]));
+
+    const output = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    // Whitespace content is normalized but still returns success
+    Assert.ok(
+      output.includes("Content (full page)"),
+      "Should use full page mode after reader fallback"
+    );
+    Assert.ok(output.includes("Empty Page"), "Should include tab label");
+    // Nothing but whitespace may follow the header's colon.
+    Assert.ok(
+      /:\s*$/.test(output),
+      "Content should be mostly empty after normalization"
+    );
+  } finally {
+    sandbox.restore();
+  }
+});
+
+// getText rejects with an Error: the tool is expected to report
+// "returned no content" (still naming the tab) rather than propagate the
+// rejection to the caller.
+add_task(async function test_getPageContent_extraction_error() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com/error";
+    const label = "Error Page";
+
+    const extractor = {
+      getReaderModeContent: sinon.stub().resolves(""),
+      getText: sinon.stub().rejects(new Error("Extraction failed")),
+    };
+    const getActor = sinon.stub().resolves(extractor);
+
+    const tab = createFakeTab(url, label);
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = getActor;
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const allowedUrls = new Set([url]);
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes("returned no content"),
+      "Should handle extraction error gracefully"
+    );
+    Assert.ok(result.includes(label), "Should include tab label");
+  } finally {
+    sb.restore();
+  }
+});
+
+// With empty reader-mode content and non-empty getText output, the result is
+// expected to be labelled "Content (full page)" and to carry the page text.
+// NOTE(review): despite the test name, no viewport-specific option is passed
+// to getPageContent here; only the default path is exercised.
+add_task(async function test_getPageContent_viewport_mode() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com/viewport";
+    const pageText = "Full page content";
+
+    const extractor = {
+      getReaderModeContent: sinon.stub().resolves(""),
+      getText: sinon.stub().resolves(pageText),
+    };
+    const getActor = sinon.stub().resolves(extractor);
+
+    const tab = createFakeTab(url, "Viewport Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = getActor;
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const allowedUrls = new Set([url]);
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes("Content (full page)"),
+      "Should use full mode by default"
+    );
+    Assert.ok(result.includes(pageText), "Should include content");
+  } finally {
+    sb.restore();
+  }
+});
+
+// When getReaderModeContent resolves to a non-empty string, the result is
+// expected to be labelled "Content (reader mode)" and to contain that string.
+add_task(async function test_getPageContent_reader_mode_string() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com/reader";
+    const readerText = "Clean reader mode text";
+
+    const extractor = {
+      getReaderModeContent: sinon.stub().resolves(readerText),
+      getText: sinon.stub().resolves("Full content"),
+    };
+    const getActor = sinon.stub().resolves(extractor);
+
+    const tab = createFakeTab(url, "Reader Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = getActor;
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const allowedUrls = new Set([url]);
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes("Content (reader mode)"),
+      "Should use reader mode by default"
+    );
+    Assert.ok(
+      result.includes(readerText),
+      "Should include reader mode content"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// The window tracker is set up with null in place of a window; the tool is
+// expected to answer with an "Error retrieving content" message.
+add_task(async function test_getPageContent_no_window() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com";
+    // Add URL to allowed list so it checks for window instead of trying headless
+    const allowedUrls = new Set([url]);
+
+    setupBrowserWindowTracker(sb, null);
+
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes("Error retrieving content"),
+      "Should handle null window gracefully"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// The tracked window reports closed=true and has no tabs. Either of two
+// error messages is accepted as a valid outcome.
+add_task(async function test_getPageContent_closed_window() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com";
+    // Add URL to allowed list so it checks for window instead of trying headless
+    const allowedUrls = new Set([url]);
+
+    const fakeClosedWindow = { closed: true, gBrowser: { tabs: [] } };
+    setupBrowserWindowTracker(sb, fakeClosedWindow);
+
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    const acceptedErrors = ["Error retrieving content", "Cannot find URL"];
+    Assert.ok(
+      acceptedErrors.some(message => result.includes(message)),
+      "Should handle closed window with error"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// The tracked window is open but its gBrowser is null; the tool is expected
+// to answer with an "Error retrieving content" message.
+add_task(async function test_getPageContent_window_without_gBrowser() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com";
+    // Add URL to allowed list so it checks for window instead of trying headless
+    const allowedUrls = new Set([url]);
+
+    const fakeWindow = { closed: false, gBrowser: null };
+    setupBrowserWindowTracker(sb, fakeWindow);
+
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes("Error retrieving content"),
+      "Should handle window without gBrowser"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// Runs of spaces and newlines in the extracted text are expected to come back
+// collapsed to single spaces.
+add_task(async function test_getPageContent_whitespace_normalization() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const targetUrl = "https://example.com/whitespace";
+    const messyContent =
+      "Text   with    lots\n\n\nof     whitespace\n\n\n\nhere";
+
+    const mockExtractor = {
+      getText: sinon.stub().resolves(messyContent),
+      getReaderModeContent: sinon.stub().resolves(""),
+    };
+
+    const tab = createFakeTab(targetUrl, "Whitespace Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+      .stub()
+      .resolves(mockExtractor);
+
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const result = await GetPageContent.getPageContent(
+      { url: targetUrl },
+      new Set([targetUrl])
+    );
+
+    Assert.ok(
+      result.includes("Text with lots of whitespace here"),
+      "Should normalize whitespace"
+    );
+    Assert.ok(
+      !result.includes("   "),
+      "Should not have multiple consecutive spaces"
+    );
+
+    // The three-space check above would let a surviving double space through,
+    // so also check the extracted content itself for any run of 2+ spaces.
+    // Only the content is inspected because the header layout is not under
+    // test here.
+    const contentMatch = result.match(/Content \(full page\) from.*:\s*(.*)/s);
+    Assert.ok(contentMatch, "Should match content pattern");
+    Assert.ok(
+      !/ {2,}/.test(contentMatch[1]),
+      "Extracted content should not contain two or more consecutive spaces"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// A string that is not a well-formed URL is requested while a single
+// unrelated tab is open; a "Cannot find URL" message is expected.
+add_task(async function test_getPageContent_invalid_url_format() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "not-a-valid-url";
+    // Add URL to allowed list so it searches tabs instead of trying headless
+    const allowedUrls = new Set([url]);
+
+    const openTabs = [createFakeTab("https://example.com", "Example")];
+    setupBrowserWindowTracker(sb, createFakeWindow(openTabs));
+
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should handle invalid URL format"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// getText resolves directly to a plain string; that string is expected to
+// appear in the result.
+add_task(async function test_getPageContent_extraction_returns_string() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com/string";
+    const plainText = "Direct string content";
+
+    const extractor = {
+      getReaderModeContent: sinon.stub().resolves(""),
+      getText: sinon.stub().resolves(plainText),
+    };
+    const getActor = sinon.stub().resolves(extractor);
+
+    const tab = createFakeTab(url, "String Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = getActor;
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const allowedUrls = new Set([url]);
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    Assert.ok(
+      result.includes(plainText),
+      "Should handle extraction returning string directly"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// getText resolves to an object instead of a string; the tool is expected to
+// report "returned no content" for it.
+add_task(async function test_getPageContent_extraction_returns_object() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://example.com/object";
+
+    // The API now expects strings, not objects
+    // If getText returns a non-string object, it should be treated as no content
+    const nonStringResult = { text: "Object text content" };
+    const extractor = {
+      getReaderModeContent: sinon.stub().resolves(""),
+      getText: sinon.stub().resolves(nonStringResult),
+    };
+    const getActor = sinon.stub().resolves(extractor);
+
+    const tab = createFakeTab(url, "Object Test");
+    tab.linkedBrowser.browsingContext.currentWindowContext.getActor = getActor;
+    setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+    const allowedUrls = new Set([url]);
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    // API expects strings now, objects are treated as no content
+    Assert.ok(
+      result.includes("returned no content"),
+      "Should treat object return value as no content"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// getText resolves to a bare number (not a string, and not an object with a
+// text field); the tool is expected to report "returned no content".
+add_task(
+  async function test_getPageContent_extraction_returns_non_string_text() {
+    const sb = sinon.createSandbox();
+
+    try {
+      const targetUrl = "https://example.com/nonstring";
+
+      const mockExtractor = {
+        getText: sinon.stub().resolves(12345),
+        getReaderModeContent: sinon.stub().resolves(""),
+      };
+
+      const tab = createFakeTab(targetUrl, "Non-string Test");
+      tab.linkedBrowser.browsingContext.currentWindowContext.getActor = sinon
+        .stub()
+        .resolves(mockExtractor);
+
+      setupBrowserWindowTracker(sb, createFakeWindow([tab]));
+
+      const result = await GetPageContent.getPageContent(
+        { url: targetUrl },
+        new Set([targetUrl])
+      );
+
+      // The failure message describes what is actually mocked: getText itself
+      // resolves to a number, there is no "text property" involved.
+      Assert.ok(
+        result.includes("returned no content"),
+        "Should treat a non-string getText result as no content"
+      );
+    } finally {
+      sb.restore();
+    }
+  }
+);
+
+// The requested URL is a member of a multi-entry allowed set but is not open
+// in any tab; a "Cannot find URL" message is expected.
+add_task(async function test_getPageContent_allowed_urls_set() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://allowed.com/page";
+    const allowedUrls = new Set([
+      "https://allowed.com/page",
+      "https://another-allowed.com",
+    ]);
+
+    const openTabs = [createFakeTab("https://other.com", "Other")];
+    setupBrowserWindowTracker(sb, createFakeWindow(openTabs));
+
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    // Headless extraction doesn't work in xpcshell environment
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should return error when tab not found (headless doesn't work in xpcshell)"
+    );
+  } finally {
+    sb.restore();
+  }
+});
+
+// Four unrelated tabs are open and the requested URL matches none of them;
+// the error is expected to say "Cannot find URL" and echo the requested URL.
+add_task(async function test_getPageContent_available_tabs_list() {
+  const sb = sinon.createSandbox();
+
+  try {
+    const url = "https://notfound.com";
+    // Add the URL to allowed list so it searches tabs instead of trying headless
+    const allowedUrls = new Set([url]);
+
+    const openTabs = [
+      ["https://first.com", "First Tab"],
+      ["https://second.com", "Second Tab"],
+      ["https://third.com", "Third Tab"],
+      ["https://fourth.com", "Fourth Tab"],
+    ].map(([tabUrl, tabLabel]) => createFakeTab(tabUrl, tabLabel));
+
+    setupBrowserWindowTracker(sb, createFakeWindow(openTabs));
+
+    const result = await GetPageContent.getPageContent({ url }, allowedUrls);
+
+    // URL is in allowed list but not open, so should get error
+    Assert.ok(
+      result.includes("Cannot find URL"),
+      "Should return error when tab not found"
+    );
+    Assert.ok(result.includes(url), "Should include requested URL in error");
+  } finally {
+    sb.restore();
+  }
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -30,6 +30,8 @@ support-files = []
 
 ["test_Tools_GetOpenTabs.js"]
 
+["test_Tools_GetPageContent.js"]
+
 ["test_Tools_SearchBrowsingHistory.js"]
 
 ["test_Utils.js"]

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	browser/components/aiwindow/models/Tools.sys.mjs	\|	284	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M	browser/components/aiwindow/models/moz.build	\|	4	++++
A	browser/components/aiwindow/models/tests/browser/browser.toml	\|	6	++++++
A	browser/components/aiwindow/models/tests/browser/browser_get_page_content.js	\|	58	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	browser/components/aiwindow/models/tests/browser/head.js	\|	71	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	browser/components/aiwindow/models/tests/xpcshell/test_Tools_GetPageContent.js	\|	819	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml	\|	2	++