commit e35fd26b70a3ef3521085d31d2ca4222c3e9d8e9
parent d0ede5b6f046a7aa21d6293332654679c393a5d5
Author: Chidam Gopal <cgopal@mozilla.com>
Date: Wed, 26 Nov 2025 21:45:42 +0000
Bug 2002372 - history data preparation for insights r=cdipersio,ai-models-reviewers
prepares input data from history for insights
Differential Revision: https://phabricator.services.mozilla.com/D274039
Diffstat:
2 files changed, 1175 insertions(+), 107 deletions(-)
diff --git a/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs b/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs
@@ -8,6 +8,46 @@
import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
+const MS_PER_DAY = 86_400_000;
+const MICROS_PER_MS = 1_000;
+const MS_PER_SEC = 1_000;
+const MICROS_PER_SEC = 1_000_000;
+const SECONDS_PER_DAY = 86_400;
+
+// History fetch defaults
+const DEFAULT_DAYS = 60;
+const DEFAULT_MAX_RESULTS = 3000;
+
+// Sessionization defaults
+const DEFAULT_GAP_SEC = 900;
+const DEFAULT_MAX_SESSION_SEC = 7200;
+
+// Recency defaults
+const DEFAULT_HALFLIFE_DAYS = 14;
+const DEFAULT_RECENCY_FLOOR = 0.5;
+const DEFAULT_SESSION_WEIGHT = 1.0;
+
+const SEARCH_ENGINE_DOMAINS = [
+ "google",
+ "bing",
+ "duckduckgo",
+ "search.brave",
+ "yahoo",
+ "startpage",
+ "ecosia",
+ "baidu",
+ "yandex",
+];
+
+// Escape regex metacharacters so a literal string can be embedded in a
+// RegExp source (used to build SEARCH_ENGINE_PATTERN from plain domains).
+function escapeRe(s) {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+const SEARCH_ENGINE_PATTERN = new RegExp(
+ `(^|\\.)(${SEARCH_ENGINE_DOMAINS.map(escapeRe).join("|")})\\.`,
+ "i"
+);
+
/**
* Fetch recent browsing history from Places (SQL), aggregate by URL,
* tag "search" vs "history", and filter low-visit URLs.
@@ -15,26 +55,17 @@ import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
* @param {object} opts
* @param {number} [opts.days=60] How far back to look
* @param {number} [opts.maxResults=3000] Max rows to return (after sort)
- * @returns {Promise<Array<{url:string,title:string,domain:string,visit_time:string,visit_count:number,source:'history'|'search'}>>}
+ * @returns {Promise<Array<{
+ * url: string,
+ * title: string,
+ * domain: string,
+ * visitDateMicros: number,
+ * frequencyPct: number,
+ * domainFrequencyPct: number,
+ * source: 'history'|'search'
+ * }>>}
*/
export async function getRecentHistory(opts = {}) {
- const MS_PER_DAY = 86400000;
- const MICROS_PER_MS = 1000;
- const DEFAULT_DAYS = 60;
- const DEFAULT_MAX_RESULTS = 3000;
-
- const SEARCH_ENGINE_DOMAINS = [
- "google",
- "bing",
- "duckduckgo",
- "search.brave",
- "yahoo",
- "startpage",
- "ecosia",
- "baidu",
- "yandex",
- ];
-
const days = opts.days ?? DEFAULT_DAYS;
const maxResults = opts.maxResults ?? DEFAULT_MAX_RESULTS;
@@ -47,17 +78,10 @@ export async function getRecentHistory(opts = {}) {
const isSearchVisit = urlStr => {
try {
const { hostname, pathname, search } = new URL(urlStr);
-
- const searchEnginePattern = new RegExp(
- `(^|\\.)(${SEARCH_ENGINE_DOMAINS.join("|")})\\.`,
- "i"
- );
- const isSearchEngine = searchEnginePattern.test(hostname);
-
+ const isSearchEngine = SEARCH_ENGINE_PATTERN.test(hostname);
const looksLikeSearch =
/search|results|query/i.test(pathname) ||
/[?&](q|query|p)=/i.test(search);
-
return isSearchEngine && looksLikeSearch;
} catch (e) {
console.error("isSearchVisit: failed to parse URL", {
@@ -158,3 +182,520 @@ export async function getRecentHistory(opts = {}) {
return [];
}
}
+
+/**
+ * Sessionize visits using a gap and max session length.
+ * Returns a new array sorted by ascending time and adds:
+ * - session_id
+ * - session_start_ms
+ * - session_start_iso
+ *
+ * @param {Array<{visitDateMicros:number,title?:string,domain?:string,frequencyPct?:number,domainFrequencyPct?:number,source?:'history'|'search'}>} rows
+ * @param {object} [opts]
+ * @param {number} [opts.gapSec=900] Max allowed gap between consecutive visits in a session (seconds)
+ * @param {number} [opts.maxSessionSec=7200] Max session duration from first to current visit (seconds)
+ * @returns {Array}
+ */
+export function sessionizeVisits(rows, opts = {}) {
+  // Convert the configured gaps (seconds) to milliseconds once, up front.
+  const GAP_MS = (opts.gapSec ?? DEFAULT_GAP_SEC) * MS_PER_SEC;
+  const MAX_SESSION_MS =
+    (opts.maxSessionSec ?? DEFAULT_MAX_SESSION_SEC) * MS_PER_SEC;
+
+  // Normalize and keep only visits with a valid timestamp
+  const normalized = rows
+    // Keep only rows with a valid timestamp
+    .filter(row => Number.isFinite(row.visitDateMicros))
+    .map(row => ({
+      ...row,
+      visitTimeMs: Math.floor(row.visitDateMicros / MICROS_PER_MS),
+    }))
+    .sort((a, b) => a.visitTimeMs - b.visitTimeMs);
+
+  let curStartMs = null;
+  let prevMs = null;
+
+  for (const row of normalized) {
+    const timeMs = row.visitTimeMs;
+
+    // A new session starts on the first visit, after an idle gap longer
+    // than GAP_MS, or once the current session exceeds MAX_SESSION_MS.
+    const startNew =
+      prevMs === null ||
+      timeMs - prevMs > GAP_MS ||
+      timeMs - curStartMs > MAX_SESSION_MS;
+
+    if (startNew) {
+      curStartMs = timeMs;
+    }
+
+    // The session's start time (ms) doubles as its stable identifier.
+    row.session_start_ms = curStartMs;
+    row.session_start_iso = new Date(curStartMs).toISOString();
+    row.session_id = curStartMs;
+
+    prevMs = timeMs;
+  }
+
+  return normalized;
+}
+
+/**
+ * Build per-session feature records from sessionized rows.
+ *
+ * Output record shape:
+ * {
+ * session_id: number,
+ * title_scores: { [title: string]: number },
+ * domain_scores: { [domain: string]: number },
+ * session_start_time: number | null, // epoch seconds
+ * session_end_time: number | null, // epoch seconds
+ * search_events: {
+ * session_id: number,
+ * search_count: number,
+ * search_titles: string[],
+ * last_searched: number, // epoch micros
+ * } | {}
+ * }
+ *
+ * @param {Array} rows sessionized visits
+ * @returns {Array}
+ */
+export function generateProfileInputs(rows) {
+ const bySession = new Map();
+ for (const row of rows) {
+ const sessionId = row.session_id;
+ if (!bySession.has(sessionId)) {
+ bySession.set(sessionId, []);
+ }
+ bySession.get(sessionId).push(row);
+ }
+
+ // session_id -> { title: frecency_pct }
+ const titleScoresBySession = {};
+ for (const [sessionId, items] of bySession) {
+ const m = {};
+ for (const r of items) {
+ const title = r.title ?? "";
+ const pct = r.frequencyPct;
+ if (title && isFiniteNumber(pct)) {
+ m[title] = pct;
+ }
+ }
+ if (Object.keys(m).length) {
+ titleScoresBySession[sessionId] = m;
+ }
+ }
+
+ // session_id -> { domain: domain_frecency_pct }
+ const domainScoresBySession = {};
+ for (const [sessionId, items] of bySession) {
+ const m = {};
+ for (const r of items) {
+ const domain = r.domain ?? r.host ?? "";
+ const pct = r.domainFrequencyPct;
+ if (domain && isFiniteNumber(pct)) {
+ m[domain] = pct;
+ }
+ }
+ if (Object.keys(m).length) {
+ domainScoresBySession[sessionId] = m;
+ }
+ }
+
+ // session_id -> { search_count, search_titles (unique), last_searched }
+ const searchSummaryBySession = {};
+ for (const [sessionId, items] of bySession) {
+ const searchItems = items.filter(r => r.source === "search");
+ if (!searchItems.length) {
+ continue;
+ }
+ const search_titles = [
+ ...new Set(searchItems.map(r => r.title).filter(Boolean)),
+ ];
+ const last_searched_raw = Math.max(
+ ...searchItems.map(r => Number(r.visitDateMicros) || 0)
+ );
+ searchSummaryBySession[sessionId] = {
+ session_id: sessionId,
+ search_count: searchItems.length,
+ search_titles,
+ last_searched: last_searched_raw,
+ };
+ }
+
+ // session start/end times
+ const sessionTimes = { start_time: {}, end_time: {} };
+ for (const [sessionId, items] of bySession) {
+ const tsList = items
+ .filter(Number.isFinite)
+ .map(r => Number(r.visitDateMicros));
+ if (tsList.length) {
+ sessionTimes.start_time[sessionId] = Math.min(...tsList);
+ sessionTimes.end_time[sessionId] = Math.max(...tsList);
+ } else {
+ sessionTimes.start_time[sessionId] = null;
+ sessionTimes.end_time[sessionId] = null;
+ }
+ }
+
+ // final prepared inputs
+ const preparedInputs = [];
+ for (const sessionId of bySession.keys()) {
+ const rawRecord = {
+ session_id: sessionId,
+ title_scores: titleScoresBySession[sessionId] || {},
+ domain_scores: domainScoresBySession[sessionId] || {},
+ session_start_time: normalizeEpochSeconds(
+ sessionTimes.start_time[sessionId]
+ ),
+ session_end_time: normalizeEpochSeconds(sessionTimes.end_time[sessionId]),
+ search_events: searchSummaryBySession[sessionId] || {},
+ };
+ const record = {};
+ for (const [key, value] of Object.entries(rawRecord)) {
+ if (value !== undefined) {
+ record[key] = value;
+ }
+ }
+ preparedInputs.push(record);
+ }
+ return preparedInputs;
+}
+
+/**
+ * Aggregate over sessions into three dictionaries:
+ * - agg_domains: domain -> { score, last_seen, num_sessions, session_importance }
+ * - agg_titles: title -> { score, last_seen, num_sessions, session_importance }
+ * - agg_searches: session_id -> { search_count, search_titles[], last_searched(sec) }
+ *
+ * Notes:
+ * - "last value wins" semantics for scores (matches your Python loop)
+ * - session_importance ~ (#sessions total / #sessions item appears in), rounded 2dp
+ *
+ * @param {Array} preparedInputs
+ * @returns {[Record<string, any>, Record<string, any>, Record<string, any>]}
+ */
+export function aggregateSessions(preparedInputs) {
+  // domain -> { score, last_seen, sessions:Set }
+  const domainAgg = Object.create(null);
+
+  // title -> { score, last_seen, sessions:Set }
+  const titleAgg = Object.create(null);
+
+  // sid -> { search_count, search_titles:Set, last_searched }
+  const searchAgg = Object.create(null);
+
+  const nowSec = Date.now() / 1000;
+  const totalSessions = preparedInputs.length;
+
+  for (const session of preparedInputs) {
+    const sessionId = session.session_id;
+    const startSec = session.session_start_time;
+    const endSec = session.session_end_time;
+    // Prefer the session end, fall back to its start, then to "now" when
+    // both are null (generateProfileInputs emits null for missing times).
+    const lastSeenSec = endSec ?? startSec ?? nowSec;
+
+    // domains
+    const domainScores = session.domain_scores || {};
+    for (const [domain, scoreVal] of Object.entries(domainScores)) {
+      const rec = getOrInit(domainAgg, domain, () => ({
+        score: 0.0,
+        last_seen: 0,
+        sessions: new Set(),
+      }));
+      rec.score = Number(scoreVal); // last value wins
+      rec.last_seen = Math.max(rec.last_seen, lastSeenSec);
+      rec.sessions.add(sessionId);
+    }
+
+    // titles
+    const titleScores = session.title_scores || {};
+    for (const [title, scoreVal] of Object.entries(titleScores)) {
+      const rec = getOrInit(titleAgg, title, () => ({
+        score: 0.0,
+        last_seen: 0,
+        sessions: new Set(),
+      }));
+      rec.score = Number(scoreVal); // last value wins
+      rec.last_seen = Math.max(rec.last_seen, lastSeenSec);
+      rec.sessions.add(sessionId);
+    }
+
+    // searches
+    const searchEvents = session.search_events || {};
+    const { search_count, search_titles, last_searched } = searchEvents;
+
+    // Skip sessions whose search_events is the empty-object sentinel ({}).
+    const hasSearchContent =
+      (search_count && search_count > 0) ||
+      (Array.isArray(search_titles) && search_titles.length) ||
+      Number.isFinite(last_searched);
+
+    if (hasSearchContent) {
+      const rec = getOrInit(searchAgg, sessionId, () => ({
+        search_count: 0,
+        search_titles: new Set(),
+        last_searched: 0.0,
+      }));
+      rec.search_count += Number(search_count || 0);
+      for (const title of search_titles || []) {
+        rec.search_titles.add(title);
+      }
+      // last_searched arrives in epoch micros; stored normalized to seconds.
+      rec.last_searched = Math.max(rec.last_searched, toSeconds(last_searched));
+    }
+  }
+
+  // Finalize: replace each sessions set with a count and importance ratio.
+  for (const rec of Object.values(domainAgg)) {
+    const n = rec.sessions.size;
+    rec.num_sessions = n;
+    rec.session_importance = n > 0 ? round2(totalSessions / n) : 0.0;
+    delete rec.sessions;
+  }
+  for (const rec of Object.values(titleAgg)) {
+    const n = rec.sessions.size;
+    rec.num_sessions = n;
+    rec.session_importance = n > 0 ? round2(totalSessions / n) : 0.0;
+    delete rec.sessions;
+  }
+
+  // Serialize the title sets into arrays for JSON-friendly output.
+  for (const key of Object.keys(searchAgg)) {
+    const rec = searchAgg[key];
+    rec.search_titles = [...rec.search_titles];
+  }
+
+  return [domainAgg, titleAgg, searchAgg];
+}
+
+/**
+ * Compute top-k domains, titles, and searches from aggregate structures.
+ *
+ * Input shapes:
+ * aggDomains: {
+ * [domain: string]: {
+ * score: number,
+ * last_seen: number,
+ * num_sessions: number,
+ * session_importance: number,
+ * }
+ * }
+ *
+ * aggTitles: {
+ * [title: string]: {
+ * score: number,
+ * last_seen: number,
+ * num_sessions: number,
+ * session_importance: number,
+ * }
+ * }
+ *
+ * aggSearches: {
+ * [sessionId: string|number]: {
+ * search_count: number,
+ * search_titles: string[],
+ * last_searched: number,
+ * }
+ * }
+ *
+ * Output shape:
+ * [
+ * [ [domain, rank], ... ], // domains, length <= kDomains
+ * [ [title, rank], ... ], // titles, length <= kTitles
+ * [ { sid, cnt, q, ls, r }, ... ], // searches, length <= kSearches
+ * ]
+ *
+ * @param {{[domain: string]: any}} aggDomains
+ * @param {{[title: string]: any}} aggTitles
+ * @param {{[sessionId: string]: any}} aggSearches
+ * @param {object} [options]
+ * @param {number} [options.k_domains=30]
+ * @param {number} [options.k_titles=60]
+ * @param {number} [options.k_searches=10]
+ * @param {number} [options.now] Current time; seconds or ms, normalized internally.
+ */
+export function topkAggregates(
+ aggDomains,
+ aggTitles,
+ aggSearches,
+ { k_domains = 30, k_titles = 60, k_searches = 10, now = undefined } = {}
+) {
+ // Normalize `now` to epoch seconds.
+ let nowSec;
+ if (now == null) {
+ nowSec = Date.now() / 1000;
+ } else {
+ const asNum = Number(now);
+ // Heuristic: treat 1e12+ as ms, otherwise seconds.
+ nowSec = asNum > 1e12 ? asNum / MS_PER_SEC : asNum;
+ }
+
+ // Domains: [{key, rank, num_sessions, last_seen}]
+ const domainRanked = Object.entries(aggDomains).map(([domain, info]) => {
+ const score = Number(info.score || 0);
+ const importance = Number(info.session_importance || 0);
+ const lastSeen = Number(info.last_seen || 0);
+ const numSessions = Number(info.num_sessions || 0);
+
+ const rank = withRecency(score, importance, lastSeen, { now: nowSec });
+
+ return {
+ key: domain,
+ rank,
+ num_sessions: numSessions,
+ last_seen: lastSeen,
+ };
+ });
+
+ // Titles: [{key, rank, num_sessions, last_seen}]
+ const titleRanked = Object.entries(aggTitles).map(([title, info]) => {
+ const score = Number(info.score || 0);
+ const importance = Number(info.session_importance || 0);
+ const lastSeen = Number(info.last_seen || 0);
+ const numSessions = Number(info.num_sessions || 0);
+
+ const rank = withRecency(score, importance, lastSeen, { now: nowSec });
+
+ return {
+ key: title,
+ rank,
+ num_sessions: numSessions,
+ last_seen: lastSeen,
+ };
+ });
+
+ // Searches: [{sid, cnt, q, ls, rank}]
+ const searchRanked = Object.entries(aggSearches).map(([sidRaw, info]) => {
+ const sid = Number.isFinite(Number(sidRaw)) ? Number(sidRaw) : sidRaw;
+ const count = Number(info.search_count || 0);
+ // `last_searched` is already seconds (aggregateSessions uses toSeconds).
+ const lastSearchedSec = Number(info.last_searched || 0);
+ const titles = Array.isArray(info.search_titles) ? info.search_titles : [];
+
+ const rank = withRecency(count, 1.0, lastSearchedSec, { now: nowSec });
+
+ return {
+ sid,
+ cnt: count,
+ q: titles,
+ ls: lastSearchedSec,
+ rank,
+ };
+ });
+
+ // Sort with tie-breakers
+ domainRanked.sort(
+ (a, b) =>
+ b.rank - a.rank ||
+ b.num_sessions - a.num_sessions ||
+ b.last_seen - a.last_seen
+ );
+
+ titleRanked.sort(
+ (a, b) =>
+ b.rank - a.rank ||
+ b.num_sessions - a.num_sessions ||
+ b.last_seen - a.last_seen
+ );
+
+ searchRanked.sort((a, b) => b.rank - a.rank || b.cnt - a.cnt || b.ls - a.ls);
+
+ // Trim and emit compact structures
+ const domainItems = domainRanked
+ .slice(0, k_domains)
+ .map(({ key, rank }) => [key, round2(rank)]);
+
+ const titleItems = titleRanked
+ .slice(0, k_titles)
+ .map(({ key, rank }) => [key, round2(rank)]);
+
+ const searchItems = searchRanked
+ .slice(0, k_searches)
+ .map(({ sid, cnt, q, ls, rank }) => ({
+ sid,
+ cnt,
+ q,
+ ls,
+ r: round2(rank),
+ }));
+
+ return [domainItems, titleItems, searchItems];
+}
+
+/**
+ * Blend a base score with session importance and a time-based decay.
+ *
+ * Intuition:
+ * rank ≈ score * sessionImportance * sessionWeight * recencyFactor
+ *
+ * where recencyFactor is in [floor, 1], decaying over time with a
+ * half-life in days.
+ *
+ * @param {number} score
+ * Base score (e.g., frecency percentile).
+ * @param {number} sessionImportance
+ * Importance derived from how many sessions the item appears in.
+ * @param {number} lastSeenSec
+ * Last-seen timestamp (epoch seconds or micros/ms; normalized via toSeconds()).
+ * @param {object} [options]
+ * @param {number} [options.halfLifeDays=14]
+ * Half-life in days for recency decay; smaller → recency matters more.
+ * @param {number} [options.floor=0.5]
+ * Minimum recency factor; keeps a base weight even for very old items.
+ * @param {number} [options.sessionWeight=1.0]
+ * Additional multiplier on sessionImportance.
+ * @param {number} [options.now]
+ * "Now" timestamp (sec/ms/µs); if omitted, Date.now() is used.
+ * @returns {number}
+ * Rounded rank score (2 decimal places).
+ */
+function withRecency(
+ score,
+ sessionImportance,
+ lastSeenSec,
+ {
+ halfLifeDays = DEFAULT_HALFLIFE_DAYS,
+ floor = DEFAULT_RECENCY_FLOOR,
+ sessionWeight = DEFAULT_SESSION_WEIGHT,
+ now = undefined,
+ } = {}
+) {
+ const nowSec = now != null ? toSeconds(now) : Date.now() / 1000;
+ const lastSec = toSeconds(lastSeenSec);
+
+ const ageDays = Math.max(0, (nowSec - lastSec) / SECONDS_PER_DAY);
+ const decay = Math.pow(0.5, ageDays / halfLifeDays);
+ const importanceScore =
+ Number(score) * (Number(sessionImportance) * Number(sessionWeight));
+
+ return round2(importanceScore * (floor + (1 - floor) * decay));
+}
+
+function isFiniteNumber(n) {
+ return typeof n === "number" && Number.isFinite(n);
+}
+
+/**
+ * Convert epoch microseconds → integer epoch seconds.
+ * If value is null/undefined/NaN, returns null.
+ *
+ * @param {number} micros
+ */
+function normalizeEpochSeconds(micros) {
+ if (!Number.isFinite(micros)) {
+ return null;
+ }
+ return Math.floor(micros / MICROS_PER_SEC);
+}
+
+function toSeconds(epochMicrosOrMs) {
+ if (!Number.isFinite(epochMicrosOrMs)) {
+ return 0;
+ }
+ const v = Number(epochMicrosOrMs);
+ return v > 1e13 ? v / MICROS_PER_SEC : v / MS_PER_SEC;
+}
+
+function getOrInit(mapObj, key, initFn) {
+ if (!(key in mapObj)) {
+ mapObj[key] = initFn();
+ }
+ return mapObj[key];
+}
+
+function round2(x) {
+ return Math.round(Number(x) * 100) / 100;
+}
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js b/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js
@@ -2,41 +2,180 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-const { getRecentHistory } = ChromeUtils.importESModule(
+const {
+ getRecentHistory,
+ sessionizeVisits,
+ generateProfileInputs,
+ aggregateSessions,
+ topkAggregates,
+} = ChromeUtils.importESModule(
"moz-src:///browser/components/aiwindow/models/InsightsHistorySource.sys.mjs"
);
-add_task(async function test_basic_history_fetch_and_shape() {
- // Seed a few visits spanning search + normal history.
- const now = Date.now();
+/**
+ * Create a single visit object for PlacesUtils.history.insertMany.
+ *
+ * @param {string} url
+ * @param {string} title
+ * @param {number} baseMs base timestamp in ms
+ * @param {number} offsetMs offset from base in ms (negative = earlier)
+ */
+function makeVisit(url, title, baseMs, offsetMs = 0) {
+ return {
+ url,
+ title,
+ visits: [{ date: new Date(baseMs + offsetMs) }],
+ };
+}
- const seeded = [
+/**
+ * Build a small, fixed set of synthetic sessionized rows for testing
+ * generateProfileInputs and aggregateSessions.
+ *
+ * Shape matches what generateProfileInputs expects: sessionized rows.
+ *
+ * @param {number} [baseMicros]
+ */
+function makeSyntheticSessionRows(baseMicros = Date.now() * 1000) {
+ return [
+ // Session 1: two history visits + one search
{
- url: "https://www.google.com/search?q=firefox+history",
- title: "Google Search: firefox history",
- visits: [{ date: new Date(now - 5 * 60 * 1000) }], // 5 min ago
+ session_id: 1,
+ url: "https://example.com/a1",
+ title: "Example A1",
+ domain: "example.com",
+ visitDateMicros: baseMicros,
+ frequencyPct: 10,
+ domainFrequencyPct: 20,
+ source: "history",
},
{
- url: "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
- title: "JavaScript | MDN",
- visits: [{ date: new Date(now - 10 * 60 * 1000) }], // 10 min ago
+ session_id: 1,
+ url: "https://example.com/a2",
+ title: "Example A2",
+ domain: "example.com",
+ visitDateMicros: baseMicros + 10_000,
+ frequencyPct: 30,
+ domainFrequencyPct: 40,
+ source: "history",
},
{
- url: "https://news.ycombinator.com/",
- title: "Hacker News",
- visits: [{ date: new Date(now - 15 * 60 * 1000) }],
- },
- {
- url: "https://search.brave.com/search?q=mozsqlite",
- title: "Brave Search: mozsqlite",
- visits: [{ date: new Date(now - 20 * 60 * 1000) }],
+ session_id: 1,
+ url: "https://www.google.com/search?q=test",
+ title: "Google search: test",
+ domain: "www.google.com",
+ visitDateMicros: baseMicros + 20_000,
+ frequencyPct: 50,
+ domainFrequencyPct: 60,
+ source: "search",
},
+
+ // Session 2: one visit, no search
{
- url: "https://mozilla.org/en-US/",
- title: "Internet for people, not profit — Mozilla",
- visits: [{ date: new Date(now - 25 * 60 * 1000) }],
+ session_id: 2,
+ url: "https://mozilla.org/",
+ title: "Mozilla",
+ domain: "mozilla.org",
+ visitDateMicros: baseMicros + 1_000_000,
+ frequencyPct: 70,
+ domainFrequencyPct: 80,
+ source: "history",
},
];
+}
+
+/**
+ * Assert the structural contract of one row returned by getRecentHistory():
+ * string url/domain/title, numeric percentile fields in [0, 100], a
+ * 'search'/'history' source label, and a finite non-negative visitDateMicros.
+ *
+ * @param {object} row one history row to validate
+ * @param {string} [msgPrefix] label prepended to every assertion message
+ */
+function assertHistoryRowShape(row, msgPrefix = "") {
+  const prefix = msgPrefix ? `${msgPrefix}: ` : "";
+
+  Assert.strictEqual(typeof row.url, "string", `${prefix}url is a string`);
+  Assert.ok(row.url.length, `${prefix}url present`);
+
+  Assert.strictEqual(
+    typeof row.domain,
+    "string",
+    `${prefix}domain is a string`
+  );
+  Assert.ok(row.domain.length, `${prefix}domain present`);
+
+  Assert.strictEqual(typeof row.title, "string", `${prefix}title is a string`);
+  Assert.ok(row.title.length, `${prefix}title present`);
+
+  Assert.strictEqual(
+    typeof row.frequencyPct,
+    "number",
+    `${prefix}frequencyPct is a number`
+  );
+  Assert.strictEqual(
+    typeof row.domainFrequencyPct,
+    "number",
+    `${prefix}domainFrequencyPct is a number`
+  );
+
+  Assert.ok(
+    row.source === "search" || row.source === "history",
+    `${prefix}source labeled`
+  );
+  Assert.ok(
+    row.frequencyPct >= 0 && row.frequencyPct <= 100,
+    `${prefix}frequencyPct within 0–100`
+  );
+  Assert.ok(
+    row.domainFrequencyPct >= 0 && row.domainFrequencyPct <= 100,
+    `${prefix}domainFrequencyPct within 0–100`
+  );
+
+  Assert.strictEqual(
+    typeof row.visitDateMicros,
+    "number",
+    `${prefix}visitDateMicros is a number`
+  );
+  Assert.ok(
+    Number.isFinite(row.visitDateMicros),
+    `${prefix}visitDateMicros is finite`
+  );
+  Assert.greaterOrEqual(
+    row.visitDateMicros,
+    0,
+    `${prefix}visitDateMicros non-negative`
+  );
+}
+
+add_task(async function test_basic_history_fetch_and_shape() {
+ await PlacesUtils.history.clear();
+ const now = Date.now();
+
+ const seeded = [
+ makeVisit(
+ "https://www.google.com/search?q=firefox+history",
+ "Google Search: firefox history",
+ now,
+ -5 * 60 * 1000
+ ),
+ makeVisit(
+ "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
+ "JavaScript | MDN",
+ now,
+ -10 * 60 * 1000
+ ),
+ makeVisit(
+ "https://news.ycombinator.com/",
+ "Hacker News",
+ now,
+ -15 * 60 * 1000
+ ),
+ makeVisit(
+ "https://search.brave.com/search?q=mozsqlite",
+ "Brave Search: mozsqlite",
+ now,
+ -20 * 60 * 1000
+ ),
+ makeVisit(
+ "https://mozilla.org/en-US/",
+ "Internet for people, not profit — Mozilla",
+ now,
+ -25 * 60 * 1000
+ ),
+ ];
// Insert via high-level API; Places will populate moz_origins/visits.
await PlacesUtils.history.insertMany(seeded);
@@ -50,50 +189,8 @@ add_task(async function test_basic_history_fetch_and_shape() {
);
// Verify required fields & types on a sample.
- for (const row of rows.slice(0, 5)) {
- Assert.strictEqual(typeof row.url, "string", "url is a string");
- Assert.ok(row.url.length, "url present");
- Assert.strictEqual(typeof row.domain, "string", "domain is a string");
- Assert.ok(row.domain.length, "domain present");
- Assert.strictEqual(typeof row.title, "string", "title is a string");
- Assert.ok(typeof row.title.length, "title present");
- Assert.strictEqual(
- typeof row.frequencyPct,
- "number",
- "frequencyPct is a number"
- );
- Assert.strictEqual(
- typeof row.domainFrequencyPct,
- "number",
- "domainFrequencyPct is a number"
- );
- Assert.ok(
- row.source === "search" || row.source === "history",
- "source labeled"
- );
- Assert.ok(
- row.frequencyPct >= 0 && row.frequencyPct <= 100,
- "frequencyPct within 0–100"
- );
- Assert.ok(
- row.domainFrequencyPct >= 0 && row.domainFrequencyPct <= 100,
- "domainFrequencyPct within 0–100"
- );
-
- Assert.strictEqual(
- typeof row.visitDateMicros,
- "number",
- "visitDateMicros is a number"
- );
- Assert.ok(
- Number.isFinite(row.visitDateMicros),
- "visitDateMicros is finite"
- );
- Assert.greaterOrEqual(
- row.visitDateMicros,
- 0,
- "visitDateMicros non-negative"
- );
+ for (const [idx, row] of rows.slice(0, 5).entries()) {
+ assertHistoryRowShape(row, `row[${idx}]`);
}
// Check ordering: newest first by visit_date.
@@ -141,11 +238,14 @@ add_task(async function test_maxResults_is_respected() {
const base = Date.now();
const toInsert = [];
for (let i = 0; i < 50; i++) {
- toInsert.push({
- url: `https://example.com/page-${i}`,
- title: `Example Page ${i}`,
- visits: [{ date: new Date(base - i * 1000) }],
- });
+ toInsert.push(
+ makeVisit(
+ `https://example.com/page-${i}`,
+ `Example Page ${i}`,
+ base,
+ -i * 1000
+ )
+ );
}
await PlacesUtils.history.insertMany(toInsert);
@@ -162,16 +262,18 @@ add_task(async function test_days_cutoff_is_respected() {
// One old (2 days), one recent (within 1 hour)
const now = Date.now();
await PlacesUtils.history.insertMany([
- {
- url: "https://old.example.com/",
- title: "Old Visit",
- visits: [{ date: new Date(now - 2 * 24 * 60 * 60 * 1000) }],
- },
- {
- url: "https://recent.example.com/",
- title: "Recent Visit",
- visits: [{ date: new Date(now - 30 * 60 * 1000) }],
- },
+ makeVisit(
+ "https://old.example.com/",
+ "Old Visit",
+ now,
+ -2 * 24 * 60 * 60 * 1000
+ ),
+ makeVisit(
+ "https://recent.example.com/",
+ "Recent Visit",
+ now,
+ -30 * 60 * 1000
+ ),
]);
const rows = await getRecentHistory({ days: 1, maxResults: 50 });
@@ -185,3 +287,428 @@ add_task(async function test_days_cutoff_is_respected() {
"Old visit filtered by days cutoff"
);
});
+
+add_task(function test_sessionizeVisits_basic() {
+ const baseMs = Date.now();
+
+ // 3 visits:
+ // - v1 at t
+ // - v2 at t + 1 min (same session)
+ // - v3 at t + 30 min (new session with default 15 min gap)
+ const rows = [
+ {
+ url: "https://example.com/1",
+ title: "First",
+ domain: "example.com",
+ visitDateMicros: (baseMs + 1 * 60 * 1000) * 1000, // v2
+ },
+ {
+ url: "https://example.com/0",
+ title: "Zero",
+ domain: "example.com",
+ visitDateMicros: baseMs * 1000, // v1
+ },
+ {
+ url: "https://example.com/2",
+ title: "Second",
+ domain: "example.com",
+ visitDateMicros: (baseMs + 30 * 60 * 1000) * 1000, // v3
+ },
+ ];
+
+ const sessionized = sessionizeVisits(rows);
+
+ Assert.equal(sessionized.length, 3, "All rows kept");
+ // Sorted ascending by time
+ Assert.ok(
+ sessionized[0].visitDateMicros <= sessionized[1].visitDateMicros &&
+ sessionized[1].visitDateMicros <= sessionized[2].visitDateMicros,
+ "Sessionized rows sorted by ascending visit time"
+ );
+
+ const [r0, r1, r2] = sessionized;
+
+ // First two within 1 minute -> same session_id
+ Assert.strictEqual(
+ r0.session_id,
+ r1.session_id,
+ "First two visits should share a session"
+ );
+
+ // Third 30 min later -> different session_id with default 15 min gap
+ Assert.notStrictEqual(
+ r1.session_id,
+ r2.session_id,
+ "Third visit should start a new session"
+ );
+
+ // Required session fields present
+ for (const r of sessionized) {
+ Assert.strictEqual(typeof r.session_id, "number", "session_id is a number");
+ Assert.strictEqual(
+ typeof r.session_start_ms,
+ "number",
+ "session_start_ms is a number"
+ );
+ Assert.strictEqual(
+ typeof r.session_start_iso,
+ "string",
+ "session_start_iso is a string"
+ );
+ // session_start_iso should be a valid ISO string matching session_start_ms
+ const parsed = new Date(r.session_start_iso);
+ Assert.ok(
+ Number.isFinite(parsed.getTime()),
+ "session_start_iso parses as a valid Date"
+ );
+ Assert.equal(
+ parsed.toISOString(),
+ r.session_start_iso,
+ "session_start_iso is in canonical ISO 8601 format"
+ );
+ // Also ensure ms and iso are consistent
+ const fromMs = new Date(r.session_start_ms).toISOString();
+ Assert.equal(
+ fromMs,
+ r.session_start_iso,
+ "session_start_iso matches session_start_ms"
+ );
+ }
+});
+
+add_task(function test_sessionizeVisits_empty_and_invalid() {
+ // Empty input -> empty output
+ let sessionized = sessionizeVisits([]);
+ Assert.ok(Array.isArray(sessionized), "Empty input returns array");
+ Assert.equal(sessionized.length, 0, "Empty input yields empty output");
+
+ // Non-finite visitDateMicros should be filtered out
+ const rows = [
+ { url: "https://example.com/a", visitDateMicros: NaN },
+ { url: "https://example.com/b", visitDateMicros: Infinity },
+ { url: "https://example.com/c", visitDateMicros: -Infinity },
+ ];
+ sessionized = sessionizeVisits(rows);
+ Assert.equal(
+ sessionized.length,
+ 0,
+ "Rows with non-finite visitDateMicros are filtered"
+ );
+});
+
+add_task(function test_sessionizeVisits_custom_gap() {
+ const baseMs = Date.now();
+
+ // Two visits 20 minutes apart.
+ const rows = [
+ {
+ url: "https://example.com/0",
+ visitDateMicros: baseMs * 1000,
+ },
+ {
+ url: "https://example.com/1",
+ visitDateMicros: (baseMs + 20 * 60 * 1000) * 1000,
+ },
+ ];
+
+  // With a 1-hour gapSec (larger than the 20-minute spacing), both visits stay in one session.
+ const sessionizedLoose = sessionizeVisits(rows, { gapSec: 3600 });
+  Assert.strictEqual(
+ sessionizedLoose[0].session_id,
+ sessionizedLoose[1].session_id,
+ "Custom large gap keeps visits in one session"
+ );
+
+  // With a 60-second gapSec (smaller than the 20-minute spacing), the visits split into two sessions.
+ const sessionizedTight = sessionizeVisits(rows, { gapSec: 60 });
+ Assert.notStrictEqual(
+ sessionizedTight[0].session_id,
+ sessionizedTight[1].session_id,
+ "Custom small gap splits sessions"
+ );
+});
+
+add_task(function test_generateProfileInputs_shapes() {
+ const rows = makeSyntheticSessionRows();
+ const prepared = generateProfileInputs(rows);
+
+ // session_id set should be preserved
+ const originalSessionIds = new Set(rows.map(r => r.session_id));
+ const preparedSessionIds = new Set(prepared.map(r => r.session_id));
+ Assert.deepEqual(
+    [...preparedSessionIds].sort((a, b) => a - b),
+    [...originalSessionIds].sort((a, b) => a - b),
+ "generateProfileInputs preserves session_id set"
+ );
+
+ Assert.equal(prepared.length, 2, "Two sessions -> two prepared records");
+
+ const bySession = new Map(prepared.map(r => [r.session_id, r]));
+ const sess1 = bySession.get(1);
+ const sess2 = bySession.get(2);
+
+ Assert.ok(sess1, "Session 1 present");
+ Assert.ok(sess2, "Session 2 present");
+
+ // Session 1: has title/domain scores and search_events
+ Assert.greater(
+ Object.keys(sess1.title_scores).length,
+ 0,
+ "Session 1 has title_scores"
+ );
+ Assert.greater(
+ Object.keys(sess1.domain_scores).length,
+ 0,
+ "Session 1 has domain_scores"
+ );
+ Assert.ok(
+ sess1.search_events &&
+ typeof sess1.search_events.search_count === "number" &&
+ Array.isArray(sess1.search_events.search_titles),
+ "Session 1 has search_events summary"
+ );
+
+ // Session 2: no search events
+ Assert.equal(
+ Object.keys(sess2.search_events).length,
+ 0,
+ "Session 2 has empty search_events"
+ );
+
+ // Start/end times should be normalized to seconds or null
+ for (const sess of prepared) {
+ Assert.ok(
+ sess.session_start_time === null ||
+ Number.isFinite(sess.session_start_time),
+ "session_start_time is null or finite"
+ );
+ Assert.ok(
+ sess.session_end_time === null || Number.isFinite(sess.session_end_time),
+ "session_end_time is null or finite"
+ );
+ }
+});
+
+add_task(function test_generateProfileInputs_search_only_and_missing_scores() {
+ const baseMicros = Date.now() * 1000;
+
+ const rows = [
+ // Session 1: search-only, with frequency/domainFrequency missing
+ {
+ session_id: 1,
+ url: "https://www.google.com/search?q=onlysearch",
+ title: "Google search: onlysearch",
+ domain: "www.google.com",
+ visitDateMicros: baseMicros,
+ source: "search",
+ // frequencyPct and domainFrequencyPct intentionally omitted
+ },
+
+ // Session 2: one history visit with scores
+ {
+ session_id: 2,
+ url: "https://example.com/",
+ title: "Example",
+ domain: "example.com",
+ visitDateMicros: baseMicros + 1000,
+ frequencyPct: 50,
+ domainFrequencyPct: 60,
+ source: "history",
+ },
+ ];
+
+ const prepared = generateProfileInputs(rows);
+ const bySession = new Map(prepared.map(r => [r.session_id, r]));
+ const sess1 = bySession.get(1);
+ const sess2 = bySession.get(2);
+
+ // Session 1: no scores because frecency fields missing, but has search_events
+ Assert.deepEqual(
+ sess1.title_scores,
+ {},
+ "Search-only session without frecency has empty title_scores"
+ );
+ Assert.deepEqual(
+ sess1.domain_scores,
+ {},
+ "Search-only session without frecency has empty domain_scores"
+ );
+ Assert.ok(
+ sess1.search_events &&
+ sess1.search_events.search_count === 1 &&
+ Array.isArray(sess1.search_events.search_titles),
+ "Search-only session still has search_events"
+ );
+
+ // Session 2: has scores, but no search_events
+ Assert.greater(
+ Object.keys(sess2.title_scores).length,
+ 0,
+ "History session has title_scores"
+ );
+ Assert.greater(
+ Object.keys(sess2.domain_scores).length,
+ 0,
+ "History session has domain_scores"
+ );
+ Assert.equal(
+ Object.keys(sess2.search_events).length,
+ 0,
+ "History-only session has empty search_events"
+ );
+});
+
+add_task(function test_aggregateSessions_basic() {
+ const rows = makeSyntheticSessionRows();
+ const preparedInputs = generateProfileInputs(rows);
+
+ const [domainAgg, titleAgg, searchAgg] = aggregateSessions(preparedInputs);
+
+ const preparedSessionIds = new Set(preparedInputs.map(p => p.session_id));
+ const searchAggIds = new Set(Object.keys(searchAgg).map(id => Number(id)));
+
+ Assert.ok(
+ [...searchAggIds].every(id => preparedSessionIds.has(id)),
+ "searchAgg keys correspond to prepared session_ids"
+ );
+
+ // Domains
+ const domainKeys = Object.keys(domainAgg).sort();
+ Assert.deepEqual(
+ domainKeys,
+ ["example.com", "mozilla.org", "www.google.com"].sort(),
+ "Domain aggregate keys as expected"
+ );
+
+ const exampleDomain = domainAgg["example.com"];
+ Assert.ok(exampleDomain, "example.com aggregate present");
+ Assert.equal(
+ exampleDomain.num_sessions,
+ 1,
+ "example.com appears in one session"
+ );
+ Assert.greater(
+ exampleDomain.session_importance,
+ 0,
+ "example.com has session_importance"
+ );
+ Assert.greaterOrEqual(
+ exampleDomain.last_seen,
+ 0,
+ "example.com last_seen is non-negative"
+ );
+
+ const mozillaDomain = domainAgg["mozilla.org"];
+ Assert.equal(
+ mozillaDomain.num_sessions,
+ 1,
+ "mozilla.org appears in one session"
+ );
+
+ const googleDomain = domainAgg["www.google.com"];
+ Assert.ok(googleDomain, "www.google.com aggregate present");
+ Assert.equal(
+ googleDomain.num_sessions,
+ 1,
+ "www.google.com appears in one session"
+ );
+
+ // Titles
+ Assert.ok(
+ Object.prototype.hasOwnProperty.call(titleAgg, "Example A1"),
+ "Title Example A1 aggregated"
+ );
+ Assert.ok(
+ Object.prototype.hasOwnProperty.call(titleAgg, "Example A2"),
+ "Title Example A2 aggregated"
+ );
+
+ const titleA2 = titleAgg["Example A2"];
+ Assert.equal(titleA2.num_sessions, 1, "Example A2 appears in one session");
+
+ // Searches
+ Assert.ok(
+ Object.prototype.hasOwnProperty.call(searchAgg, 1),
+ "Search aggregate for session 1 present"
+ );
+ Assert.ok(
+ !Object.prototype.hasOwnProperty.call(searchAgg, 2),
+ "No search aggregate for session 2"
+ );
+
+ const search1 = searchAgg[1];
+ Assert.equal(search1.search_count, 1, "search_count aggregated");
+ Assert.deepEqual(
+    [...search1.search_titles].sort(),
+ ["Google search: test"].sort(),
+ "search_titles aggregated and deduplicated"
+ );
+ Assert.greater(
+ search1.last_searched,
+ 0,
+ "last_searched converted to seconds and > 0"
+ );
+});
+
+add_task(function test_aggregateSessions_empty() {
+ const [domainAgg, titleAgg, searchAgg] = aggregateSessions([]);
+
+ Assert.deepEqual(
+ Object.keys(domainAgg),
+ [],
+ "Empty input -> no domain aggregates"
+ );
+ Assert.deepEqual(
+ Object.keys(titleAgg),
+ [],
+ "Empty input -> no title aggregates"
+ );
+ Assert.deepEqual(
+ Object.keys(searchAgg),
+ [],
+ "Empty input -> no search aggregates"
+ );
+});
+
+add_task(function test_topkAggregates_recency_and_ranking() {
+ const nowSec = Math.floor(Date.now() / 1000);
+
+ // Two domains:
+ // - old.com: very old
+ // - fresh.com: very recent
+ const aggDomains = {
+ "old.com": {
+ score: 100,
+ last_seen: nowSec - 60 * 60 * 24 * 60, // 60 days ago
+ num_sessions: 1,
+ session_importance: 1,
+ },
+ "fresh.com": {
+ score: 100,
+ last_seen: nowSec - 60 * 60, // 1 hour ago
+ num_sessions: 1,
+ session_importance: 1,
+ },
+ };
+
+ const [domainItems] = topkAggregates(
+ aggDomains,
+ {},
+ {},
+ {
+ k_domains: 2,
+ k_titles: 0,
+ k_searches: 0,
+ now: nowSec,
+ }
+ );
+
+ // Expect fresh.com to outrank old.com due to recency decay.
+ const [firstDomain, secondDomain] = domainItems.map(([key]) => key);
+ Assert.equal(
+ firstDomain,
+ "fresh.com",
+ "More recent domain outranks older one"
+ );
+ Assert.equal(secondDomain, "old.com", "Older domain comes second");
+});