tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 565db6f03a080c85e1801e0fee948291e02e16b7
parent 6aff720ee5acf233af7ff3cb45721a485fd671fa
Author: Chidam Gopal <cgopal@mozilla.com>
Date:   Mon, 24 Nov 2025 15:55:25 +0000

Bug 2000725 - Add history extraction for insights r=ai-models-reviewers,cdipersio

Extract history for insights

Differential Revision: https://phabricator.services.mozilla.com/D272954

Diffstat:
A browser/components/aiwindow/models/InsightsHistorySource.sys.mjs | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/moz.build | 8 ++++++++
A browser/components/aiwindow/models/tests/xpcshell/head.js | 17 +++++++++++++++++
A browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml | 7 +++++++
5 files changed, 379 insertions(+), 0 deletions(-)

diff --git a/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs b/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs
@@ -0,0 +1,160 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * This module handles the visit extraction data from browsing history
+ */
+
+import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
+
+/**
+ * Fetch recent visits from Places (SQL), newest first, one row per visit,
+ * with per-URL/per-domain frecency percentiles and a "search"/"history" tag.
+ *
+ * @param {object} opts
+ * @param {number} [opts.days=60] How far back to look, in days
+ * @param {number} [opts.maxResults=3000] Max visits to return (newest first)
+ * @returns {Promise<Array<{url:string,domain:string,title:string,visitDateMicros:number,frequencyPct:number,domainFrequencyPct:number,source:'history'|'search'}>>} Empty array if the query fails
+ */
+export async function getRecentHistory(opts = {}) {
+  const MS_PER_DAY = 86400000;
+  const MICROS_PER_MS = 1000;
+  const DEFAULT_DAYS = 60;
+  const DEFAULT_MAX_RESULTS = 3000;
+
+  const SEARCH_ENGINE_DOMAINS = [
+    "google",
+    "bing",
+    "duckduckgo",
+    "search.brave",
+    "yahoo",
+    "startpage",
+    "ecosia",
+    "baidu",
+    "yandex",
+  ];
+
+  const days = opts.days ?? DEFAULT_DAYS;
+  const maxResults = opts.maxResults ?? DEFAULT_MAX_RESULTS;
+
+  // Places stores visit_date in microseconds since epoch.
+  const cutoffMicros = Math.max(
+    0,
+    (Date.now() - days * MS_PER_DAY) * MICROS_PER_MS
+  );
+
+  // Compiled once per call rather than per row; "." in a domain is escaped.
+  const searchEnginePattern = new RegExp(
+    `(^|\\.)(${SEARCH_ENGINE_DOMAINS.join("|").replaceAll(".", "\\.")})\\.`,
+    "i"
+  );
+  const isSearchVisit = urlStr => {
+    try {
+      const { hostname, pathname, search } = new URL(urlStr);
+      const isSearchEngine = searchEnginePattern.test(hostname);
+
+      const looksLikeSearch =
+        /search|results|query/i.test(pathname) ||
+        /[?&](q|query|p)=/i.test(search);
+
+      return isSearchEngine && looksLikeSearch;
+    } catch (e) {
+      console.error("isSearchVisit: failed to parse URL", {
+        error: String(e),
+        urlLength: typeof urlStr === "string" ? urlStr.length : -1,
+      });
+      return false;
+    }
+  };
+
+  const SQL = `
+    WITH visit_info AS (
+      SELECT
+        p.id AS place_id,
+        p.url AS url,
+        o.host AS host,
+        p.title AS title,
+        v.visit_date AS visit_date,
+        p.frecency AS frecency,
+        CASE WHEN o.frecency = -1 THEN 1 ELSE o.frecency END AS domain_frecency
+      FROM moz_places p
+      JOIN moz_historyvisits v ON v.place_id = p.id
+      JOIN moz_origins o ON p.origin_id = o.id
+      WHERE v.visit_date >= :cutoff
+        AND p.title IS NOT NULL
+        AND p.frecency IS NOT NULL
+      ORDER BY v.visit_date DESC
+      LIMIT :limit
+    ),
+
+    /* Collapse to one row per place to compute percentiles (like your groupby/place_id mean) */
+    per_place AS (
+      SELECT
+        place_id,
+        MAX(frecency) AS frecency,
+        MAX(domain_frecency) AS domain_frecency
+      FROM visit_info
+      GROUP BY place_id
+    ),
+
+    /* Percentiles using window function CUME_DIST() */
+    per_place_with_pct AS (
+      SELECT
+        place_id,
+        ROUND(100.0 * CUME_DIST() OVER (ORDER BY frecency), 2) AS frecency_pct,
+        ROUND(100.0 * CUME_DIST() OVER (ORDER BY domain_frecency), 2) AS domain_frecency_pct
+      FROM per_place
+    )
+
+    /* Final rows: original visits + joined percentiles + source label */
+    SELECT
+      v.url,
+      v.host,
+      v.title,
+      v.visit_date,
+      p.frecency_pct,
+      p.domain_frecency_pct
+    FROM visit_info v
+    JOIN per_place_with_pct p USING (place_id)
+    ORDER BY v.visit_date DESC
+  `;
+
+  try {
+    const rows = await PlacesUtils.withConnectionWrapper(
+      "smartwindow-getRecentHistory",
+      async db => {
+        const stmt = await db.execute(SQL, {
+          cutoff: cutoffMicros,
+          limit: maxResults,
+        });
+
+        const out = [];
+        for (const row of stmt) {
+          const url = row.getResultByName("url");
+          const host = row.getResultByName("host");
+          const title = row.getResultByName("title") || "";
+          const visitDateMicros = row.getResultByName("visit_date") || 0;
+          const frequencyPct = row.getResultByName("frecency_pct") || 0;
+          const domainFrequencyPct =
+            row.getResultByName("domain_frecency_pct") || 0;
+
+          out.push({
+            url,
+            domain: host,
+            title,
+            visitDateMicros,
+            frequencyPct,
+            domainFrequencyPct,
+            source: isSearchVisit(url) ? "search" : "history",
+          });
+        }
+        return out;
+      }
+    );
+    return rows;
+  } catch (error) {
+    console.error("Failed to fetch Places history via SQL:", error);
+    return [];
+  }
+}
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -4,3 +4,11 @@
 
 with Files("**"):
     BUG_COMPONENT = ("Core", "Machine Learning: General")
+
+MOZ_SRC_FILES += [
+    "InsightsHistorySource.sys.mjs",
+]
+
+XPCSHELL_TESTS_MANIFESTS += [
+    "tests/xpcshell/xpcshell.toml",
+]
diff --git a/browser/components/aiwindow/models/tests/xpcshell/head.js b/browser/components/aiwindow/models/tests/xpcshell/head.js
@@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Any shared setup for these tests lives here. */
+const { PlacesUtils } = ChromeUtils.importESModule(
+  "resource://gre/modules/PlacesUtils.sys.mjs"
+);
+const { PlacesTestUtils } = ChromeUtils.importESModule(
+  "resource://testing-common/PlacesTestUtils.sys.mjs"
+);
+
+add_task(async function setup_profile() {
+  do_get_profile(); // ensure a profile dir (needed by Places)
+  // Start from a clean history DB
+  await PlacesUtils.history.clear();
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js b/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js
@@ -0,0 +1,187 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const { getRecentHistory } = ChromeUtils.importESModule(
+  "moz-src:///browser/components/aiwindow/models/InsightsHistorySource.sys.mjs"
+);
+
+add_task(async function test_basic_history_fetch_and_shape() {
+  // Seed a few visits spanning search + normal history.
+  const now = Date.now();
+
+  const seeded = [
+    {
+      url: "https://www.google.com/search?q=firefox+history",
+      title: "Google Search: firefox history",
+      visits: [{ date: new Date(now - 5 * 60 * 1000) }], // 5 min ago
+    },
+    {
+      url: "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
+      title: "JavaScript | MDN",
+      visits: [{ date: new Date(now - 10 * 60 * 1000) }], // 10 min ago
+    },
+    {
+      url: "https://news.ycombinator.com/",
+      title: "Hacker News",
+      visits: [{ date: new Date(now - 15 * 60 * 1000) }],
+    },
+    {
+      url: "https://search.brave.com/search?q=mozsqlite",
+      title: "Brave Search: mozsqlite",
+      visits: [{ date: new Date(now - 20 * 60 * 1000) }],
+    },
+    {
+      url: "https://mozilla.org/en-US/",
+      title: "Internet for people, not profit — Mozilla",
+      visits: [{ date: new Date(now - 25 * 60 * 1000) }],
+    },
+  ];
+
+  // Insert via high-level API; Places will populate moz_origins/visits.
+  await PlacesUtils.history.insertMany(seeded);
+
+  const rows = await getRecentHistory({ days: 1, maxResults: 100 });
+  Assert.ok(Array.isArray(rows), "Should return an array");
+  Assert.greaterOrEqual(
+    rows.length,
+    seeded.length,
+    "Should return at least seeded rows"
+  );
+
+  // Verify required fields & types on a sample.
+  for (const row of rows.slice(0, 5)) {
+    Assert.strictEqual(typeof row.url, "string", "url is a string");
+    Assert.ok(row.url.length, "url present");
+    Assert.strictEqual(typeof row.domain, "string", "domain is a string");
+    Assert.ok(row.domain.length, "domain present");
+    Assert.strictEqual(typeof row.title, "string", "title is a string");
+    Assert.ok(row.title.length, "title present");
+    Assert.strictEqual(
+      typeof row.frequencyPct,
+      "number",
+      "frequencyPct is a number"
+    );
+    Assert.strictEqual(
+      typeof row.domainFrequencyPct,
+      "number",
+      "domainFrequencyPct is a number"
+    );
+    Assert.ok(
+      row.source === "search" || row.source === "history",
+      "source labeled"
+    );
+    Assert.ok(
+      row.frequencyPct >= 0 && row.frequencyPct <= 100,
+      "frequencyPct within 0–100"
+    );
+    Assert.ok(
+      row.domainFrequencyPct >= 0 && row.domainFrequencyPct <= 100,
+      "domainFrequencyPct within 0–100"
+    );
+
+    Assert.strictEqual(
+      typeof row.visitDateMicros,
+      "number",
+      "visitDateMicros is a number"
+    );
+    Assert.ok(
+      Number.isFinite(row.visitDateMicros),
+      "visitDateMicros is finite"
+    );
+    Assert.greaterOrEqual(
+      row.visitDateMicros,
+      0,
+      "visitDateMicros non-negative"
+    );
+  }
+
+  // Check ordering: newest first by visit_date.
+  const copy = rows.map(r => r.visitDateMicros);
+  const sorted = [...copy].sort((a, b) => b - a);
+  Assert.deepEqual(
+    copy.slice(0, 10),
+    sorted.slice(0, 10),
+    "Rows are ordered by visit date desc"
+  );
+
+  // Search-source tagging should catch major engines with query paths.
+  const byUrl = new Map(rows.map(r => [r.url, r]));
+  Assert.equal(
+    byUrl.get(seeded[0].url).source,
+    "search",
+    "Google search tagged as 'search'"
+  );
+  Assert.equal(
+    byUrl.get(seeded[3].url).source,
+    "search",
+    "Brave search tagged as 'search'"
+  );
+  Assert.equal(
+    byUrl.get(seeded[1].url).source,
+    "history",
+    "MDN should be 'history'"
+  );
+  Assert.equal(
+    byUrl.get(seeded[2].url).source,
+    "history",
+    "Hacker News should be 'history'"
+  );
+  Assert.equal(
+    byUrl.get(seeded[4].url).source,
+    "history",
+    "Mozilla should be 'history'"
+  );
+});
+
+add_task(async function test_maxResults_is_respected() {
+  // Create a burst of visits so we can test LIMIT behavior.
+  await PlacesUtils.history.clear();
+
+  const base = Date.now();
+  const toInsert = [];
+  for (let i = 0; i < 50; i++) {
+    toInsert.push({
+      url: `https://example.com/page-${i}`,
+      title: `Example Page ${i}`,
+      visits: [{ date: new Date(base - i * 1000) }],
+    });
+  }
+  await PlacesUtils.history.insertMany(toInsert);
+
+  const rows10 = await getRecentHistory({ days: 1, maxResults: 10 });
+  Assert.equal(rows10.length, 10, "maxResults=10 respected");
+
+  const rows5 = await getRecentHistory({ days: 1, maxResults: 5 });
+  Assert.equal(rows5.length, 5, "maxResults=5 respected");
+});
+
+add_task(async function test_days_cutoff_is_respected() {
+  await PlacesUtils.history.clear();
+
+  // One old (2 days), one recent (within 1 hour)
+  const now = Date.now();
+  await PlacesUtils.history.insertMany([
+    {
+      url: "https://old.example.com/",
+      title: "Old Visit",
+      visits: [{ date: new Date(now - 2 * 24 * 60 * 60 * 1000) }],
+    },
+    {
+      url: "https://recent.example.com/",
+      title: "Recent Visit",
+      visits: [{ date: new Date(now - 30 * 60 * 1000) }],
+    },
+  ]);
+
+  const rows = await getRecentHistory({ days: 1, maxResults: 50 });
+  const urls = rows.map(r => r.url);
+  Assert.ok(
+    urls.includes("https://recent.example.com/"),
+    "Recent visit present"
+  );
+  Assert.ok(
+    !urls.includes("https://old.example.com/"),
+    "Old visit filtered by days cutoff"
+  );
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -0,0 +1,7 @@
+[DEFAULT]
+run-if = ["os != 'android'"]
+head = "head.js"
+firefox-appdir = "browser"
+support-files = []
+
+["test_InsightsHistorySource.js"]