commit 565db6f03a080c85e1801e0fee948291e02e16b7
parent 6aff720ee5acf233af7ff3cb45721a485fd671fa
Author: Chidam Gopal <cgopal@mozilla.com>
Date: Mon, 24 Nov 2025 15:55:25 +0000
Bug 2000725 - Add history extraction for insights r=ai-models-reviewers,cdipersio
Extract history for insights
Differential Revision: https://phabricator.services.mozilla.com/D272954
Diffstat:
5 files changed, 379 insertions(+), 0 deletions(-)
diff --git a/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs b/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs
@@ -0,0 +1,160 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * This module extracts recent visit data from the Places browsing history.
+ */
+
+import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
+
+/**
+ * Fetch recent browsing history from Places (SQL), aggregate by URL,
+ * tag "search" vs "history", and filter low-visit URLs.
+ *
+ * @param {object} opts
+ * @param {number} [opts.days=60] How far back to look
+ * @param {number} [opts.maxResults=3000] Max rows to return (after sort)
+ * @returns {Promise<Array<{url:string,title:string,domain:string,visit_time:string,visit_count:number,source:'history'|'search'}>>}
+ */
+export async function getRecentHistory(opts = {}) {
+ const MS_PER_DAY = 86400000;
+ const MICROS_PER_MS = 1000;
+ const DEFAULT_DAYS = 60;
+ const DEFAULT_MAX_RESULTS = 3000;
+
+ const SEARCH_ENGINE_DOMAINS = [
+ "google",
+ "bing",
+ "duckduckgo",
+ "search.brave",
+ "yahoo",
+ "startpage",
+ "ecosia",
+ "baidu",
+ "yandex",
+ ];
+
+ const days = opts.days ?? DEFAULT_DAYS;
+ const maxResults = opts.maxResults ?? DEFAULT_MAX_RESULTS;
+
+ // Places stores visit_date in microseconds since epoch.
+ const cutoffMicros = Math.max(
+ 0,
+ (Date.now() - days * MS_PER_DAY) * MICROS_PER_MS
+ );
+
+ const isSearchVisit = urlStr => {
+ try {
+ const { hostname, pathname, search } = new URL(urlStr);
+
+ const searchEnginePattern = new RegExp(
+ `(^|\\.)(${SEARCH_ENGINE_DOMAINS.join("|")})\\.`,
+ "i"
+ );
+ const isSearchEngine = searchEnginePattern.test(hostname);
+
+ const looksLikeSearch =
+ /search|results|query/i.test(pathname) ||
+ /[?&](q|query|p)=/i.test(search);
+
+ return isSearchEngine && looksLikeSearch;
+ } catch (e) {
+ console.error("isSearchVisit: failed to parse URL", {
+ error: String(e),
+ urlLength: typeof urlStr === "string" ? urlStr.length : -1,
+ });
+ return false;
+ }
+ };
+
+ const SQL = `
+ WITH visit_info AS (
+ SELECT
+ p.id AS place_id,
+ p.url AS url,
+ o.host AS host,
+ p.title AS title,
+ v.visit_date AS visit_date,
+ p.frecency AS frecency,
+ CASE WHEN o.frecency = -1 THEN 1 ELSE o.frecency END AS domain_frecency
+ FROM moz_places p
+ JOIN moz_historyvisits v ON v.place_id = p.id
+ JOIN moz_origins o ON p.origin_id = o.id
+ WHERE v.visit_date >= :cutoff
+ AND p.title IS NOT NULL
+ AND p.frecency IS NOT NULL
+ ORDER BY v.visit_date DESC
+ LIMIT :limit
+ ),
+
+ /* Collapse to one row per place to compute percentiles (like your groupby/place_id mean) */
+ per_place AS (
+ SELECT
+ place_id,
+ MAX(frecency) AS frecency,
+ MAX(domain_frecency) AS domain_frecency
+ FROM visit_info
+ GROUP BY place_id
+ ),
+
+ /* Percentiles using window function CUME_DIST() */
+ per_place_with_pct AS (
+ SELECT
+ place_id,
+ ROUND(100.0 * CUME_DIST() OVER (ORDER BY frecency), 2) AS frecency_pct,
+ ROUND(100.0 * CUME_DIST() OVER (ORDER BY domain_frecency), 2) AS domain_frecency_pct
+ FROM per_place
+ )
+
+ /* Final rows: original visits + joined percentiles + source label */
+ SELECT
+ v.url,
+ v.host,
+ v.title,
+ v.visit_date,
+ p.frecency_pct,
+ p.domain_frecency_pct
+ FROM visit_info v
+ JOIN per_place_with_pct p USING (place_id)
+ ORDER BY v.visit_date DESC
+ `;
+
+ try {
+ const rows = await PlacesUtils.withConnectionWrapper(
+ "smartwindow-getRecentHistory",
+ async db => {
+ const stmt = await db.execute(SQL, {
+ cutoff: cutoffMicros,
+ limit: maxResults,
+ });
+
+ const out = [];
+ for (const row of stmt) {
+ const url = row.getResultByName("url");
+ const host = row.getResultByName("host");
+ const title = row.getResultByName("title") || "";
+ const visitDateMicros = row.getResultByName("visit_date") || 0;
+ const frequencyPct = row.getResultByName("frecency_pct") || 0;
+ const domainFrequencyPct =
+ row.getResultByName("domain_frecency_pct") || 0;
+
+ out.push({
+ url,
+ domain: host,
+ title,
+ visitDateMicros,
+ frequencyPct,
+ domainFrequencyPct,
+ source: isSearchVisit(url) ? "search" : "history",
+ });
+ }
+ return out;
+ }
+ );
+ return rows;
+ } catch (error) {
+ console.error("Failed to fetch Places history via SQL:", error);
+ return [];
+ }
+}
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -4,3 +4,11 @@
with Files("**"):
BUG_COMPONENT = ("Core", "Machine Learning: General")
+
+MOZ_SRC_FILES += [
+ "InsightsHistorySource.sys.mjs",
+]
+
+XPCSHELL_TESTS_MANIFESTS += [
+ "tests/xpcshell/xpcshell.toml",
+]
diff --git a/browser/components/aiwindow/models/tests/xpcshell/head.js b/browser/components/aiwindow/models/tests/xpcshell/head.js
@@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Any shared setup for these tests lives here. */
+const { PlacesUtils } = ChromeUtils.importESModule(
+ "resource://gre/modules/PlacesUtils.sys.mjs"
+);
+const { PlacesTestUtils } = ChromeUtils.importESModule(
+ "resource://testing-common/PlacesTestUtils.sys.mjs"
+);
+
+add_task(async function setup_profile() {
+  // Places needs a profile directory, so make sure one exists first.
+  do_get_profile();
+  await PlacesUtils.history.clear(); // start from a clean history DB
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js b/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js
@@ -0,0 +1,187 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const { getRecentHistory } = ChromeUtils.importESModule(
+ "moz-src:///browser/components/aiwindow/models/InsightsHistorySource.sys.mjs"
+);
+
+add_task(async function test_basic_history_fetch_and_shape() {
+ // Seed a few visits spanning search + normal history.
+ const now = Date.now();
+
+ const seeded = [
+ {
+ url: "https://www.google.com/search?q=firefox+history",
+ title: "Google Search: firefox history",
+ visits: [{ date: new Date(now - 5 * 60 * 1000) }], // 5 min ago
+ },
+ {
+ url: "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
+ title: "JavaScript | MDN",
+ visits: [{ date: new Date(now - 10 * 60 * 1000) }], // 10 min ago
+ },
+ {
+ url: "https://news.ycombinator.com/",
+ title: "Hacker News",
+ visits: [{ date: new Date(now - 15 * 60 * 1000) }],
+ },
+ {
+ url: "https://search.brave.com/search?q=mozsqlite",
+ title: "Brave Search: mozsqlite",
+ visits: [{ date: new Date(now - 20 * 60 * 1000) }],
+ },
+ {
+ url: "https://mozilla.org/en-US/",
+ title: "Internet for people, not profit — Mozilla",
+ visits: [{ date: new Date(now - 25 * 60 * 1000) }],
+ },
+ ];
+
+ // Insert via high-level API; Places will populate moz_origins/visits.
+ await PlacesUtils.history.insertMany(seeded);
+
+ const rows = await getRecentHistory({ days: 1, maxResults: 100 });
+ Assert.ok(Array.isArray(rows), "Should return an array");
+ Assert.greaterOrEqual(
+ rows.length,
+ seeded.length,
+ "Should return at least seeded rows"
+ );
+
+ // Verify required fields & types on a sample.
+ for (const row of rows.slice(0, 5)) {
+ Assert.strictEqual(typeof row.url, "string", "url is a string");
+ Assert.ok(row.url.length, "url present");
+ Assert.strictEqual(typeof row.domain, "string", "domain is a string");
+ Assert.ok(row.domain.length, "domain present");
+ Assert.strictEqual(typeof row.title, "string", "title is a string");
+ Assert.ok(typeof row.title.length, "title present");
+ Assert.strictEqual(
+ typeof row.frequencyPct,
+ "number",
+ "frequencyPct is a number"
+ );
+ Assert.strictEqual(
+ typeof row.domainFrequencyPct,
+ "number",
+ "domainFrequencyPct is a number"
+ );
+ Assert.ok(
+ row.source === "search" || row.source === "history",
+ "source labeled"
+ );
+ Assert.ok(
+ row.frequencyPct >= 0 && row.frequencyPct <= 100,
+ "frequencyPct within 0–100"
+ );
+ Assert.ok(
+ row.domainFrequencyPct >= 0 && row.domainFrequencyPct <= 100,
+ "domainFrequencyPct within 0–100"
+ );
+
+ Assert.strictEqual(
+ typeof row.visitDateMicros,
+ "number",
+ "visitDateMicros is a number"
+ );
+ Assert.ok(
+ Number.isFinite(row.visitDateMicros),
+ "visitDateMicros is finite"
+ );
+ Assert.greaterOrEqual(
+ row.visitDateMicros,
+ 0,
+ "visitDateMicros non-negative"
+ );
+ }
+
+ // Check ordering: newest first by visit_date.
+ const copy = rows.map(r => r.visitDateMicros);
+ const sorted = [...copy].sort((a, b) => b - a);
+ Assert.deepEqual(
+ copy.slice(0, 10),
+ sorted.slice(0, 10),
+ "Rows are ordered by visit date desc"
+ );
+
+ // Search-source tagging should catch major engines with query paths.
+ const byUrl = new Map(rows.map(r => [r.url, r]));
+ Assert.equal(
+ byUrl.get(seeded[0].url).source,
+ "search",
+ "Google search tagged as 'search'"
+ );
+ Assert.equal(
+ byUrl.get(seeded[3].url).source,
+ "search",
+ "Brave search tagged as 'search'"
+ );
+ Assert.equal(
+ byUrl.get(seeded[1].url).source,
+ "history",
+ "MDN should be 'history'"
+ );
+ Assert.equal(
+ byUrl.get(seeded[2].url).source,
+ "history",
+ "Hacker News should be 'history'"
+ );
+ Assert.equal(
+ byUrl.get(seeded[4].url).source,
+ "history",
+ "Internet for people, not profit — Mozilla"
+ );
+});
+
+add_task(async function test_maxResults_is_respected() {
+ // Create a burst of visits so we can test LIMIT behavior.
+ await PlacesUtils.history.clear();
+
+ const base = Date.now();
+ const toInsert = [];
+ for (let i = 0; i < 50; i++) {
+ toInsert.push({
+ url: `https://example.com/page-${i}`,
+ title: `Example Page ${i}`,
+ visits: [{ date: new Date(base - i * 1000) }],
+ });
+ }
+ await PlacesUtils.history.insertMany(toInsert);
+
+ const rows10 = await getRecentHistory({ days: 1, maxResults: 10 });
+ Assert.equal(rows10.length, 10, "maxResults=10 respected");
+
+ const rows5 = await getRecentHistory({ days: 1, maxResults: 5 });
+ Assert.equal(rows5.length, 5, "maxResults=5 respected");
+});
+
+add_task(async function test_days_cutoff_is_respected() {
+ await PlacesUtils.history.clear();
+
+ // One old (2 days), one recent (within 1 hour)
+ const now = Date.now();
+ await PlacesUtils.history.insertMany([
+ {
+ url: "https://old.example.com/",
+ title: "Old Visit",
+ visits: [{ date: new Date(now - 2 * 24 * 60 * 60 * 1000) }],
+ },
+ {
+ url: "https://recent.example.com/",
+ title: "Recent Visit",
+ visits: [{ date: new Date(now - 30 * 60 * 1000) }],
+ },
+ ]);
+
+ const rows = await getRecentHistory({ days: 1, maxResults: 50 });
+ const urls = rows.map(r => r.url);
+ Assert.ok(
+ urls.includes("https://recent.example.com/"),
+ "Recent visit present"
+ );
+ Assert.ok(
+ !urls.includes("https://old.example.com/"),
+ "Old visit filtered by days cutoff"
+ );
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -0,0 +1,7 @@
+[DEFAULT]
+run-if = ["os != 'android'"]
+head = "head.js"
+firefox-appdir = "browser"
+support-files = []
+
+["test_InsightsHistorySource.js"]