tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit e35fd26b70a3ef3521085d31d2ca4222c3e9d8e9
parent d0ede5b6f046a7aa21d6293332654679c393a5d5
Author: Chidam Gopal <cgopal@mozilla.com>
Date:   Wed, 26 Nov 2025 21:45:42 +0000

Bug 2002372 - history data preparation for insights r=cdipersio,ai-models-reviewers

Prepares input data from history for insights.

Differential Revision: https://phabricator.services.mozilla.com/D274039

Diffstat:
M browser/components/aiwindow/models/InsightsHistorySource.sys.mjs | 593 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js | 689 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 1175 insertions(+), 107 deletions(-)

diff --git a/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs b/browser/components/aiwindow/models/InsightsHistorySource.sys.mjs @@ -8,6 +8,46 @@ import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs"; +const MS_PER_DAY = 86_400_000; +const MICROS_PER_MS = 1_000; +const MS_PER_SEC = 1_000; +const MICROS_PER_SEC = 1_000_000; +const SECONDS_PER_DAY = 86_400; + +// History fetch defaults +const DEFAULT_DAYS = 60; +const DEFAULT_MAX_RESULTS = 3000; + +// Sessionization defaults +const DEFAULT_GAP_SEC = 900; +const DEFAULT_MAX_SESSION_SEC = 7200; + +// Recency defaults +const DEFAULT_HALFLIFE_DAYS = 14; +const DEFAULT_RECENCY_FLOOR = 0.5; +const DEFAULT_SESSION_WEIGHT = 1.0; + +const SEARCH_ENGINE_DOMAINS = [ + "google", + "bing", + "duckduckgo", + "search.brave", + "yahoo", + "startpage", + "ecosia", + "baidu", + "yandex", +]; + +function escapeRe(s) { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +const SEARCH_ENGINE_PATTERN = new RegExp( + `(^|\\.)(${SEARCH_ENGINE_DOMAINS.map(escapeRe).join("|")})\\.`, + "i" +); + /** * Fetch recent browsing history from Places (SQL), aggregate by URL, * tag "search" vs "history", and filter low-visit URLs. 
@@ -15,26 +55,17 @@ import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs"; * @param {object} opts * @param {number} [opts.days=60] How far back to look * @param {number} [opts.maxResults=3000] Max rows to return (after sort) - * @returns {Promise<Array<{url:string,title:string,domain:string,visit_time:string,visit_count:number,source:'history'|'search'}>>} + * @returns {Promise<Array<{ + * url: string, + * title: string, + * domain: string, + * visitDateMicros: number, + * frequencyPct: number, + * domainFrequencyPct: number, + * source: 'history'|'search' + * }>>} */ export async function getRecentHistory(opts = {}) { - const MS_PER_DAY = 86400000; - const MICROS_PER_MS = 1000; - const DEFAULT_DAYS = 60; - const DEFAULT_MAX_RESULTS = 3000; - - const SEARCH_ENGINE_DOMAINS = [ - "google", - "bing", - "duckduckgo", - "search.brave", - "yahoo", - "startpage", - "ecosia", - "baidu", - "yandex", - ]; - const days = opts.days ?? DEFAULT_DAYS; const maxResults = opts.maxResults ?? DEFAULT_MAX_RESULTS; @@ -47,17 +78,10 @@ export async function getRecentHistory(opts = {}) { const isSearchVisit = urlStr => { try { const { hostname, pathname, search } = new URL(urlStr); - - const searchEnginePattern = new RegExp( - `(^|\\.)(${SEARCH_ENGINE_DOMAINS.join("|")})\\.`, - "i" - ); - const isSearchEngine = searchEnginePattern.test(hostname); - + const isSearchEngine = SEARCH_ENGINE_PATTERN.test(hostname); const looksLikeSearch = /search|results|query/i.test(pathname) || /[?&](q|query|p)=/i.test(search); - return isSearchEngine && looksLikeSearch; } catch (e) { console.error("isSearchVisit: failed to parse URL", { @@ -158,3 +182,520 @@ export async function getRecentHistory(opts = {}) { return []; } } + +/** + * Sessionize visits using a gap and max session length. 
+ * Returns a new array sorted by ascending time and adds: + * - session_id + * - session_start_ms + * - session_start_iso + * + * @param {Array<{visitDateMicros:number,title?:string,domain?:string,frequencyPct?:number,domainFrequencyPct?:number,source?:'history'|'search'}>} rows + * @param {object} [opts] + * @param {number} [opts.gapSec=900] Max allowed gap between consecutive visits in a session (seconds) + * @param {number} [opts.maxSessionSec=7200] Max session duration from first to current visit (seconds) + * @returns {Array} + */ +export function sessionizeVisits(rows, opts = {}) { + const GAP_MS = (opts.gapSec ?? DEFAULT_GAP_SEC) * MS_PER_SEC; + const MAX_SESSION_MS = + (opts.maxSessionSec ?? DEFAULT_MAX_SESSION_SEC) * MS_PER_SEC; + + // Normalize and keep only visits with a valid timestamp + const normalized = rows + // Keep only rows with a valid timestamp + .filter(row => Number.isFinite(row.visitDateMicros)) + .map(row => ({ + ...row, + visitTimeMs: Math.floor(row.visitDateMicros / MICROS_PER_MS), + })) + .sort((a, b) => a.visitTimeMs - b.visitTimeMs); + + let curStartMs = null; + let prevMs = null; + + for (const row of normalized) { + const timeMs = row.visitTimeMs; + + const startNew = + prevMs === null || + timeMs - prevMs > GAP_MS || + timeMs - curStartMs > MAX_SESSION_MS; + + if (startNew) { + curStartMs = timeMs; + } + + row.session_start_ms = curStartMs; + row.session_start_iso = new Date(curStartMs).toISOString(); + row.session_id = curStartMs; + + prevMs = timeMs; + } + + return normalized; +} + +/** + * Build per-session feature records from sessionized rows. 
+ * + * Output record shape: + * { + * session_id: number, + * title_scores: { [title: string]: number }, + * domain_scores: { [domain: string]: number }, + * session_start_time: number | null, // epoch seconds + * session_end_time: number | null, // epoch seconds + * search_events: { + * session_id: number, + * search_count: number, + * search_titles: string[], + * last_searched: number, // epoch micros + * } | {} + * } + * + * @param {Array} rows sessionized visits + * @returns {Array} + */ +export function generateProfileInputs(rows) { + const bySession = new Map(); + for (const row of rows) { + const sessionId = row.session_id; + if (!bySession.has(sessionId)) { + bySession.set(sessionId, []); + } + bySession.get(sessionId).push(row); + } + + // session_id -> { title: frecency_pct } + const titleScoresBySession = {}; + for (const [sessionId, items] of bySession) { + const m = {}; + for (const r of items) { + const title = r.title ?? ""; + const pct = r.frequencyPct; + if (title && isFiniteNumber(pct)) { + m[title] = pct; + } + } + if (Object.keys(m).length) { + titleScoresBySession[sessionId] = m; + } + } + + // session_id -> { domain: domain_frecency_pct } + const domainScoresBySession = {}; + for (const [sessionId, items] of bySession) { + const m = {}; + for (const r of items) { + const domain = r.domain ?? r.host ?? 
""; + const pct = r.domainFrequencyPct; + if (domain && isFiniteNumber(pct)) { + m[domain] = pct; + } + } + if (Object.keys(m).length) { + domainScoresBySession[sessionId] = m; + } + } + + // session_id -> { search_count, search_titles (unique), last_searched } + const searchSummaryBySession = {}; + for (const [sessionId, items] of bySession) { + const searchItems = items.filter(r => r.source === "search"); + if (!searchItems.length) { + continue; + } + const search_titles = [ + ...new Set(searchItems.map(r => r.title).filter(Boolean)), + ]; + const last_searched_raw = Math.max( + ...searchItems.map(r => Number(r.visitDateMicros) || 0) + ); + searchSummaryBySession[sessionId] = { + session_id: sessionId, + search_count: searchItems.length, + search_titles, + last_searched: last_searched_raw, + }; + } + + // session start/end times + const sessionTimes = { start_time: {}, end_time: {} }; + for (const [sessionId, items] of bySession) { + const tsList = items + .filter(Number.isFinite) + .map(r => Number(r.visitDateMicros)); + if (tsList.length) { + sessionTimes.start_time[sessionId] = Math.min(...tsList); + sessionTimes.end_time[sessionId] = Math.max(...tsList); + } else { + sessionTimes.start_time[sessionId] = null; + sessionTimes.end_time[sessionId] = null; + } + } + + // final prepared inputs + const preparedInputs = []; + for (const sessionId of bySession.keys()) { + const rawRecord = { + session_id: sessionId, + title_scores: titleScoresBySession[sessionId] || {}, + domain_scores: domainScoresBySession[sessionId] || {}, + session_start_time: normalizeEpochSeconds( + sessionTimes.start_time[sessionId] + ), + session_end_time: normalizeEpochSeconds(sessionTimes.end_time[sessionId]), + search_events: searchSummaryBySession[sessionId] || {}, + }; + const record = {}; + for (const [key, value] of Object.entries(rawRecord)) { + if (value !== undefined) { + record[key] = value; + } + } + preparedInputs.push(record); + } + return preparedInputs; +} + +/** + * Aggregate 
over sessions into three dictionaries: + * - agg_domains: domain -> { score, last_seen, num_sessions, session_importance } + * - agg_titles: title -> { score, last_seen, num_sessions, session_importance } + * - agg_searches: session_id -> { search_count, search_titles[], last_searched(sec) } + * + * Notes: + * - "last value wins" semantics for scores (matches your Python loop) + * - session_importance ~ (#sessions total / #sessions item appears in), rounded 2dp + * + * @param {Array} preparedInputs + * @returns {[Record<string, any>, Record<string, any>, Record<string, any>]} + */ +export function aggregateSessions(preparedInputs) { + // domain -> { score, last_seen, sessions:Set } + const domainAgg = Object.create(null); + + // title -> { score, last_seen, sessions:Set } + const titleAgg = Object.create(null); + + // sid -> { search_count, search_titles:Set, last_searched } + const searchAgg = Object.create(null); + + const nowSec = Date.now() / 1000; + const totalSessions = preparedInputs.length; + + for (const session of preparedInputs) { + const sessionId = session.session_id; + const startSec = session.session_start_time; + const endSec = session.session_end_time; + const lastSeenSec = endSec ?? startSec ?? 
nowSec; + + // domains + const domainScores = session.domain_scores || {}; + for (const [domain, scoreVal] of Object.entries(domainScores)) { + const rec = getOrInit(domainAgg, domain, () => ({ + score: 0.0, + last_seen: 0, + sessions: new Set(), + })); + rec.score = Number(scoreVal); // last value wins + rec.last_seen = Math.max(rec.last_seen, lastSeenSec); + rec.sessions.add(sessionId); + } + + // titles + const titleScores = session.title_scores || {}; + for (const [title, scoreVal] of Object.entries(titleScores)) { + const rec = getOrInit(titleAgg, title, () => ({ + score: 0.0, + last_seen: 0, + sessions: new Set(), + })); + rec.score = Number(scoreVal); // last value wins + rec.last_seen = Math.max(rec.last_seen, lastSeenSec); + rec.sessions.add(sessionId); + } + + // searches + const searchEvents = session.search_events || {}; + const { search_count, search_titles, last_searched } = searchEvents; + + const hasSearchContent = + (search_count && search_count > 0) || + (Array.isArray(search_titles) && search_titles.length) || + Number.isFinite(last_searched); + + if (hasSearchContent) { + const rec = getOrInit(searchAgg, sessionId, () => ({ + search_count: 0, + search_titles: new Set(), + last_searched: 0.0, + })); + rec.search_count += Number(search_count || 0); + for (const title of search_titles || []) { + rec.search_titles.add(title); + } + rec.last_searched = Math.max(rec.last_searched, toSeconds(last_searched)); + } + } + + for (const rec of Object.values(domainAgg)) { + const n = rec.sessions.size; + rec.num_sessions = n; + rec.session_importance = n > 0 ? round2(totalSessions / n) : 0.0; + delete rec.sessions; + } + for (const rec of Object.values(titleAgg)) { + const n = rec.sessions.size; + rec.num_sessions = n; + rec.session_importance = n > 0 ? 
round2(totalSessions / n) : 0.0; + delete rec.sessions; + } + + for (const key of Object.keys(searchAgg)) { + const rec = searchAgg[key]; + rec.search_titles = [...rec.search_titles]; + } + + return [domainAgg, titleAgg, searchAgg]; +} + +/** + * Compute top-k domains, titles, and searches from aggregate structures. + * + * Input shapes: + * aggDomains: { + * [domain: string]: { + * score: number, + * last_seen: number, + * num_sessions: number, + * session_importance: number, + * } + * } + * + * aggTitles: { + * [title: string]: { + * score: number, + * last_seen: number, + * num_sessions: number, + * session_importance: number, + * } + * } + * + * aggSearches: { + * [sessionId: string|number]: { + * search_count: number, + * search_titles: string[], + * last_searched: number, + * } + * } + * + * Output shape: + * [ + * [ [domain, rank], ... ], // domains, length <= kDomains + * [ [title, rank], ... ], // titles, length <= kTitles + * [ { sid, cnt, q, ls, r }, ... ], // searches, length <= kSearches + * ] + * + * @param {{[domain: string]: any}} aggDomains + * @param {{[title: string]: any}} aggTitles + * @param {{[sessionId: string]: any}} aggSearches + * @param {object} [options] + * @param {number} [options.k_domains=30] + * @param {number} [options.k_titles=60] + * @param {number} [options.k_searches=10] + * @param {number} [options.now] Current time; seconds or ms, normalized internally. + */ +export function topkAggregates( + aggDomains, + aggTitles, + aggSearches, + { k_domains = 30, k_titles = 60, k_searches = 10, now = undefined } = {} +) { + // Normalize `now` to epoch seconds. + let nowSec; + if (now == null) { + nowSec = Date.now() / 1000; + } else { + const asNum = Number(now); + // Heuristic: treat 1e12+ as ms, otherwise seconds. + nowSec = asNum > 1e12 ? 
asNum / MS_PER_SEC : asNum; + } + + // Domains: [{key, rank, num_sessions, last_seen}] + const domainRanked = Object.entries(aggDomains).map(([domain, info]) => { + const score = Number(info.score || 0); + const importance = Number(info.session_importance || 0); + const lastSeen = Number(info.last_seen || 0); + const numSessions = Number(info.num_sessions || 0); + + const rank = withRecency(score, importance, lastSeen, { now: nowSec }); + + return { + key: domain, + rank, + num_sessions: numSessions, + last_seen: lastSeen, + }; + }); + + // Titles: [{key, rank, num_sessions, last_seen}] + const titleRanked = Object.entries(aggTitles).map(([title, info]) => { + const score = Number(info.score || 0); + const importance = Number(info.session_importance || 0); + const lastSeen = Number(info.last_seen || 0); + const numSessions = Number(info.num_sessions || 0); + + const rank = withRecency(score, importance, lastSeen, { now: nowSec }); + + return { + key: title, + rank, + num_sessions: numSessions, + last_seen: lastSeen, + }; + }); + + // Searches: [{sid, cnt, q, ls, rank}] + const searchRanked = Object.entries(aggSearches).map(([sidRaw, info]) => { + const sid = Number.isFinite(Number(sidRaw)) ? Number(sidRaw) : sidRaw; + const count = Number(info.search_count || 0); + // `last_searched` is already seconds (aggregateSessions uses toSeconds). + const lastSearchedSec = Number(info.last_searched || 0); + const titles = Array.isArray(info.search_titles) ? 
info.search_titles : []; + + const rank = withRecency(count, 1.0, lastSearchedSec, { now: nowSec }); + + return { + sid, + cnt: count, + q: titles, + ls: lastSearchedSec, + rank, + }; + }); + + // Sort with tie-breakers + domainRanked.sort( + (a, b) => + b.rank - a.rank || + b.num_sessions - a.num_sessions || + b.last_seen - a.last_seen + ); + + titleRanked.sort( + (a, b) => + b.rank - a.rank || + b.num_sessions - a.num_sessions || + b.last_seen - a.last_seen + ); + + searchRanked.sort((a, b) => b.rank - a.rank || b.cnt - a.cnt || b.ls - a.ls); + + // Trim and emit compact structures + const domainItems = domainRanked + .slice(0, k_domains) + .map(({ key, rank }) => [key, round2(rank)]); + + const titleItems = titleRanked + .slice(0, k_titles) + .map(({ key, rank }) => [key, round2(rank)]); + + const searchItems = searchRanked + .slice(0, k_searches) + .map(({ sid, cnt, q, ls, rank }) => ({ + sid, + cnt, + q, + ls, + r: round2(rank), + })); + + return [domainItems, titleItems, searchItems]; +} + +/** + * Blend a base score with session importance and a time-based decay. + * + * Intuition: + * rank ≈ score * sessionImportance * sessionWeight * recencyFactor + * + * where recencyFactor is in [floor, 1], decaying over time with a + * half-life in days. + * + * @param {number} score + * Base score (e.g., frecency percentile). + * @param {number} sessionImportance + * Importance derived from how many sessions the item appears in. + * @param {number} lastSeenSec + * Last-seen timestamp (epoch seconds or micros/ms; normalized via toSeconds()). + * @param {object} [options] + * @param {number} [options.halfLifeDays=14] + * Half-life in days for recency decay; smaller → recency matters more. + * @param {number} [options.floor=0.5] + * Minimum recency factor; keeps a base weight even for very old items. + * @param {number} [options.sessionWeight=1.0] + * Additional multiplier on sessionImportance. 
+ * @param {number} [options.now] + * "Now" timestamp (sec/ms/µs); if omitted, Date.now() is used. + * @returns {number} + * Rounded rank score (2 decimal places). + */ +function withRecency( + score, + sessionImportance, + lastSeenSec, + { + halfLifeDays = DEFAULT_HALFLIFE_DAYS, + floor = DEFAULT_RECENCY_FLOOR, + sessionWeight = DEFAULT_SESSION_WEIGHT, + now = undefined, + } = {} +) { + const nowSec = now != null ? toSeconds(now) : Date.now() / 1000; + const lastSec = toSeconds(lastSeenSec); + + const ageDays = Math.max(0, (nowSec - lastSec) / SECONDS_PER_DAY); + const decay = Math.pow(0.5, ageDays / halfLifeDays); + const importanceScore = + Number(score) * (Number(sessionImportance) * Number(sessionWeight)); + + return round2(importanceScore * (floor + (1 - floor) * decay)); +} + +function isFiniteNumber(n) { + return typeof n === "number" && Number.isFinite(n); +} + +/** + * Convert epoch microseconds → integer epoch seconds. + * If value is null/undefined/NaN, returns null. + * + * @param {number} micros + */ +function normalizeEpochSeconds(micros) { + if (!Number.isFinite(micros)) { + return null; + } + return Math.floor(micros / MICROS_PER_SEC); +} + +function toSeconds(epochMicrosOrMs) { + if (!Number.isFinite(epochMicrosOrMs)) { + return 0; + } + const v = Number(epochMicrosOrMs); + return v > 1e13 ? v / MICROS_PER_SEC : v / MS_PER_SEC; +} + +function getOrInit(mapObj, key, initFn) { + if (!(key in mapObj)) { + mapObj[key] = initFn(); + } + return mapObj[key]; +} + +function round2(x) { + return Math.round(Number(x) * 100) / 100; +} diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js b/browser/components/aiwindow/models/tests/xpcshell/test_InsightsHistorySource.js @@ -2,41 +2,180 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ -const { getRecentHistory } = ChromeUtils.importESModule( +const { + getRecentHistory, + sessionizeVisits, + generateProfileInputs, + aggregateSessions, + topkAggregates, +} = ChromeUtils.importESModule( "moz-src:///browser/components/aiwindow/models/InsightsHistorySource.sys.mjs" ); -add_task(async function test_basic_history_fetch_and_shape() { - // Seed a few visits spanning search + normal history. - const now = Date.now(); +/** + * Create a single visit object for PlacesUtils.history.insertMany. + * + * @param {string} url + * @param {string} title + * @param {number} baseMs base timestamp in ms + * @param {number} offsetMs offset from base in ms (negative = earlier) + */ +function makeVisit(url, title, baseMs, offsetMs = 0) { + return { + url, + title, + visits: [{ date: new Date(baseMs + offsetMs) }], + }; +} - const seeded = [ +/** + * Build a small, fixed set of synthetic sessionized rows for testing + * generateProfileInputs and aggregateSessions. + * + * Shape matches what generateProfileInputs expects: sessionized rows. 
+ * + * @param {number} [baseMicros] + */ +function makeSyntheticSessionRows(baseMicros = Date.now() * 1000) { + return [ + // Session 1: two history visits + one search { - url: "https://www.google.com/search?q=firefox+history", - title: "Google Search: firefox history", - visits: [{ date: new Date(now - 5 * 60 * 1000) }], // 5 min ago + session_id: 1, + url: "https://example.com/a1", + title: "Example A1", + domain: "example.com", + visitDateMicros: baseMicros, + frequencyPct: 10, + domainFrequencyPct: 20, + source: "history", }, { - url: "https://developer.mozilla.org/en-US/docs/Web/JavaScript", - title: "JavaScript | MDN", - visits: [{ date: new Date(now - 10 * 60 * 1000) }], // 10 min ago + session_id: 1, + url: "https://example.com/a2", + title: "Example A2", + domain: "example.com", + visitDateMicros: baseMicros + 10_000, + frequencyPct: 30, + domainFrequencyPct: 40, + source: "history", }, { - url: "https://news.ycombinator.com/", - title: "Hacker News", - visits: [{ date: new Date(now - 15 * 60 * 1000) }], - }, - { - url: "https://search.brave.com/search?q=mozsqlite", - title: "Brave Search: mozsqlite", - visits: [{ date: new Date(now - 20 * 60 * 1000) }], + session_id: 1, + url: "https://www.google.com/search?q=test", + title: "Google search: test", + domain: "www.google.com", + visitDateMicros: baseMicros + 20_000, + frequencyPct: 50, + domainFrequencyPct: 60, + source: "search", }, + + // Session 2: one visit, no search { - url: "https://mozilla.org/en-US/", - title: "Internet for people, not profit — Mozilla", - visits: [{ date: new Date(now - 25 * 60 * 1000) }], + session_id: 2, + url: "https://mozilla.org/", + title: "Mozilla", + domain: "mozilla.org", + visitDateMicros: baseMicros + 1_000_000, + frequencyPct: 70, + domainFrequencyPct: 80, + source: "history", }, ]; +} + +function assertHistoryRowShape(row, msgPrefix = "") { + const prefix = msgPrefix ? 
`${msgPrefix}: ` : ""; + + Assert.strictEqual(typeof row.url, "string", `${prefix}url is a string`); + Assert.ok(row.url.length, `${prefix}url present`); + + Assert.strictEqual( + typeof row.domain, + "string", + `${prefix}domain is a string` + ); + Assert.ok(row.domain.length, `${prefix}domain present`); + + Assert.strictEqual(typeof row.title, "string", `${prefix}title is a string`); + Assert.ok(row.title.length, `${prefix}title present`); + + Assert.strictEqual( + typeof row.frequencyPct, + "number", + `${prefix}frequencyPct is a number` + ); + Assert.strictEqual( + typeof row.domainFrequencyPct, + "number", + `${prefix}domainFrequencyPct is a number` + ); + + Assert.ok( + row.source === "search" || row.source === "history", + `${prefix}source labeled` + ); + Assert.ok( + row.frequencyPct >= 0 && row.frequencyPct <= 100, + `${prefix}frequencyPct within 0–100` + ); + Assert.ok( + row.domainFrequencyPct >= 0 && row.domainFrequencyPct <= 100, + `${prefix}domainFrequencyPct within 0–100` + ); + + Assert.strictEqual( + typeof row.visitDateMicros, + "number", + `${prefix}visitDateMicros is a number` + ); + Assert.ok( + Number.isFinite(row.visitDateMicros), + `${prefix}visitDateMicros is finite` + ); + Assert.greaterOrEqual( + row.visitDateMicros, + 0, + `${prefix}visitDateMicros non-negative` + ); +} + +add_task(async function test_basic_history_fetch_and_shape() { + await PlacesUtils.history.clear(); + const now = Date.now(); + + const seeded = [ + makeVisit( + "https://www.google.com/search?q=firefox+history", + "Google Search: firefox history", + now, + -5 * 60 * 1000 + ), + makeVisit( + "https://developer.mozilla.org/en-US/docs/Web/JavaScript", + "JavaScript | MDN", + now, + -10 * 60 * 1000 + ), + makeVisit( + "https://news.ycombinator.com/", + "Hacker News", + now, + -15 * 60 * 1000 + ), + makeVisit( + "https://search.brave.com/search?q=mozsqlite", + "Brave Search: mozsqlite", + now, + -20 * 60 * 1000 + ), + makeVisit( + "https://mozilla.org/en-US/", + "Internet 
for people, not profit — Mozilla", + now, + -25 * 60 * 1000 + ), + ]; // Insert via high-level API; Places will populate moz_origins/visits. await PlacesUtils.history.insertMany(seeded); @@ -50,50 +189,8 @@ add_task(async function test_basic_history_fetch_and_shape() { ); // Verify required fields & types on a sample. - for (const row of rows.slice(0, 5)) { - Assert.strictEqual(typeof row.url, "string", "url is a string"); - Assert.ok(row.url.length, "url present"); - Assert.strictEqual(typeof row.domain, "string", "domain is a string"); - Assert.ok(row.domain.length, "domain present"); - Assert.strictEqual(typeof row.title, "string", "title is a string"); - Assert.ok(typeof row.title.length, "title present"); - Assert.strictEqual( - typeof row.frequencyPct, - "number", - "frequencyPct is a number" - ); - Assert.strictEqual( - typeof row.domainFrequencyPct, - "number", - "domainFrequencyPct is a number" - ); - Assert.ok( - row.source === "search" || row.source === "history", - "source labeled" - ); - Assert.ok( - row.frequencyPct >= 0 && row.frequencyPct <= 100, - "frequencyPct within 0–100" - ); - Assert.ok( - row.domainFrequencyPct >= 0 && row.domainFrequencyPct <= 100, - "domainFrequencyPct within 0–100" - ); - - Assert.strictEqual( - typeof row.visitDateMicros, - "number", - "visitDateMicros is a number" - ); - Assert.ok( - Number.isFinite(row.visitDateMicros), - "visitDateMicros is finite" - ); - Assert.greaterOrEqual( - row.visitDateMicros, - 0, - "visitDateMicros non-negative" - ); + for (const [idx, row] of rows.slice(0, 5).entries()) { + assertHistoryRowShape(row, `row[${idx}]`); } // Check ordering: newest first by visit_date. 
@@ -141,11 +238,14 @@ add_task(async function test_maxResults_is_respected() { const base = Date.now(); const toInsert = []; for (let i = 0; i < 50; i++) { - toInsert.push({ - url: `https://example.com/page-${i}`, - title: `Example Page ${i}`, - visits: [{ date: new Date(base - i * 1000) }], - }); + toInsert.push( + makeVisit( + `https://example.com/page-${i}`, + `Example Page ${i}`, + base, + -i * 1000 + ) + ); } await PlacesUtils.history.insertMany(toInsert); @@ -162,16 +262,18 @@ add_task(async function test_days_cutoff_is_respected() { // One old (2 days), one recent (within 1 hour) const now = Date.now(); await PlacesUtils.history.insertMany([ - { - url: "https://old.example.com/", - title: "Old Visit", - visits: [{ date: new Date(now - 2 * 24 * 60 * 60 * 1000) }], - }, - { - url: "https://recent.example.com/", - title: "Recent Visit", - visits: [{ date: new Date(now - 30 * 60 * 1000) }], - }, + makeVisit( + "https://old.example.com/", + "Old Visit", + now, + -2 * 24 * 60 * 60 * 1000 + ), + makeVisit( + "https://recent.example.com/", + "Recent Visit", + now, + -30 * 60 * 1000 + ), ]); const rows = await getRecentHistory({ days: 1, maxResults: 50 }); @@ -185,3 +287,428 @@ add_task(async function test_days_cutoff_is_respected() { "Old visit filtered by days cutoff" ); }); + +add_task(function test_sessionizeVisits_basic() { + const baseMs = Date.now(); + + // 3 visits: + // - v1 at t + // - v2 at t + 1 min (same session) + // - v3 at t + 30 min (new session with default 15 min gap) + const rows = [ + { + url: "https://example.com/1", + title: "First", + domain: "example.com", + visitDateMicros: (baseMs + 1 * 60 * 1000) * 1000, // v2 + }, + { + url: "https://example.com/0", + title: "Zero", + domain: "example.com", + visitDateMicros: baseMs * 1000, // v1 + }, + { + url: "https://example.com/2", + title: "Second", + domain: "example.com", + visitDateMicros: (baseMs + 30 * 60 * 1000) * 1000, // v3 + }, + ]; + + const sessionized = sessionizeVisits(rows); + + 
Assert.equal(sessionized.length, 3, "All rows kept"); + // Sorted ascending by time + Assert.ok( + sessionized[0].visitDateMicros <= sessionized[1].visitDateMicros && + sessionized[1].visitDateMicros <= sessionized[2].visitDateMicros, + "Sessionized rows sorted by ascending visit time" + ); + + const [r0, r1, r2] = sessionized; + + // First two within 1 minute -> same session_id + Assert.strictEqual( + r0.session_id, + r1.session_id, + "First two visits should share a session" + ); + + // Third 30 min later -> different session_id with default 15 min gap + Assert.notStrictEqual( + r1.session_id, + r2.session_id, + "Third visit should start a new session" + ); + + // Required session fields present + for (const r of sessionized) { + Assert.strictEqual(typeof r.session_id, "number", "session_id is a number"); + Assert.strictEqual( + typeof r.session_start_ms, + "number", + "session_start_ms is a number" + ); + Assert.strictEqual( + typeof r.session_start_iso, + "string", + "session_start_iso is a string" + ); + // session_start_iso should be a valid ISO string matching session_start_ms + const parsed = new Date(r.session_start_iso); + Assert.ok( + Number.isFinite(parsed.getTime()), + "session_start_iso parses as a valid Date" + ); + Assert.equal( + parsed.toISOString(), + r.session_start_iso, + "session_start_iso is in canonical ISO 8601 format" + ); + // Also ensure ms and iso are consistent + const fromMs = new Date(r.session_start_ms).toISOString(); + Assert.equal( + fromMs, + r.session_start_iso, + "session_start_iso matches session_start_ms" + ); + } +}); + +add_task(function test_sessionizeVisits_empty_and_invalid() { + // Empty input -> empty output + let sessionized = sessionizeVisits([]); + Assert.ok(Array.isArray(sessionized), "Empty input returns array"); + Assert.equal(sessionized.length, 0, "Empty input yields empty output"); + + // Non-finite visitDateMicros should be filtered out + const rows = [ + { url: "https://example.com/a", visitDateMicros: NaN 
}, + { url: "https://example.com/b", visitDateMicros: Infinity }, + { url: "https://example.com/c", visitDateMicros: -Infinity }, + ]; + sessionized = sessionizeVisits(rows); + Assert.equal( + sessionized.length, + 0, + "Rows with non-finite visitDateMicros are filtered" + ); +}); + +add_task(function test_sessionizeVisits_custom_gap() { + const baseMs = Date.now(); + + // Two visits 20 minutes apart. + const rows = [ + { + url: "https://example.com/0", + visitDateMicros: baseMs * 1000, + }, + { + url: "https://example.com/1", + visitDateMicros: (baseMs + 20 * 60 * 1000) * 1000, + }, + ]; + + // With a huge gapSec, they should stay in one session. + const sessionizedLoose = sessionizeVisits(rows, { gapSec: 3600 }); + Assert.equal( + sessionizedLoose[0].session_id, + sessionizedLoose[1].session_id, + "Custom large gap keeps visits in one session" + ); + + // With a tiny gapSec, they should split. + const sessionizedTight = sessionizeVisits(rows, { gapSec: 60 }); + Assert.notStrictEqual( + sessionizedTight[0].session_id, + sessionizedTight[1].session_id, + "Custom small gap splits sessions" + ); +}); + +add_task(function test_generateProfileInputs_shapes() { + const rows = makeSyntheticSessionRows(); + const prepared = generateProfileInputs(rows); + + // session_id set should be preserved + const originalSessionIds = new Set(rows.map(r => r.session_id)); + const preparedSessionIds = new Set(prepared.map(r => r.session_id)); + Assert.deepEqual( + [...preparedSessionIds].sort(), + [...originalSessionIds].sort(), + "generateProfileInputs preserves session_id set" + ); + + Assert.equal(prepared.length, 2, "Two sessions -> two prepared records"); + + const bySession = new Map(prepared.map(r => [r.session_id, r])); + const sess1 = bySession.get(1); + const sess2 = bySession.get(2); + + Assert.ok(sess1, "Session 1 present"); + Assert.ok(sess2, "Session 2 present"); + + // Session 1: has title/domain scores and search_events + Assert.greater( + 
Object.keys(sess1.title_scores).length, + 0, + "Session 1 has title_scores" + ); + Assert.greater( + Object.keys(sess1.domain_scores).length, + 0, + "Session 1 has domain_scores" + ); + Assert.ok( + sess1.search_events && + typeof sess1.search_events.search_count === "number" && + Array.isArray(sess1.search_events.search_titles), + "Session 1 has search_events summary" + ); + + // Session 2: no search events + Assert.equal( + Object.keys(sess2.search_events).length, + 0, + "Session 2 has empty search_events" + ); + + // Start/end times should be normalized to seconds or null + for (const sess of prepared) { + Assert.ok( + sess.session_start_time === null || + Number.isFinite(sess.session_start_time), + "session_start_time is null or finite" + ); + Assert.ok( + sess.session_end_time === null || Number.isFinite(sess.session_end_time), + "session_end_time is null or finite" + ); + } +}); + +add_task(function test_generateProfileInputs_search_only_and_missing_scores() { + const baseMicros = Date.now() * 1000; + + const rows = [ + // Session 1: search-only, with frequency/domainFrequency missing + { + session_id: 1, + url: "https://www.google.com/search?q=onlysearch", + title: "Google search: onlysearch", + domain: "www.google.com", + visitDateMicros: baseMicros, + source: "search", + // frequencyPct and domainFrequencyPct intentionally omitted + }, + + // Session 2: one history visit with scores + { + session_id: 2, + url: "https://example.com/", + title: "Example", + domain: "example.com", + visitDateMicros: baseMicros + 1000, + frequencyPct: 50, + domainFrequencyPct: 60, + source: "history", + }, + ]; + + const prepared = generateProfileInputs(rows); + const bySession = new Map(prepared.map(r => [r.session_id, r])); + const sess1 = bySession.get(1); + const sess2 = bySession.get(2); + + // Session 1: no scores because frecency fields missing, but has search_events + Assert.deepEqual( + sess1.title_scores, + {}, + "Search-only session without frecency has empty 
title_scores" + ); + Assert.deepEqual( + sess1.domain_scores, + {}, + "Search-only session without frecency has empty domain_scores" + ); + Assert.ok( + sess1.search_events && + sess1.search_events.search_count === 1 && + Array.isArray(sess1.search_events.search_titles), + "Search-only session still has search_events" + ); + + // Session 2: has scores, but no search_events + Assert.greater( + Object.keys(sess2.title_scores).length, + 0, + "History session has title_scores" + ); + Assert.greater( + Object.keys(sess2.domain_scores).length, + 0, + "History session has domain_scores" + ); + Assert.equal( + Object.keys(sess2.search_events).length, + 0, + "History-only session has empty search_events" + ); +}); + +add_task(function test_aggregateSessions_basic() { + const rows = makeSyntheticSessionRows(); + const preparedInputs = generateProfileInputs(rows); + + const [domainAgg, titleAgg, searchAgg] = aggregateSessions(preparedInputs); + + const preparedSessionIds = new Set(preparedInputs.map(p => p.session_id)); + const searchAggIds = new Set(Object.keys(searchAgg).map(id => Number(id))); + + Assert.ok( + [...searchAggIds].every(id => preparedSessionIds.has(id)), + "searchAgg keys correspond to prepared session_ids" + ); + + // Domains + const domainKeys = Object.keys(domainAgg).sort(); + Assert.deepEqual( + domainKeys, + ["example.com", "mozilla.org", "www.google.com"].sort(), + "Domain aggregate keys as expected" + ); + + const exampleDomain = domainAgg["example.com"]; + Assert.ok(exampleDomain, "example.com aggregate present"); + Assert.equal( + exampleDomain.num_sessions, + 1, + "example.com appears in one session" + ); + Assert.greater( + exampleDomain.session_importance, + 0, + "example.com has session_importance" + ); + Assert.greaterOrEqual( + exampleDomain.last_seen, + 0, + "example.com last_seen is non-negative" + ); + + const mozillaDomain = domainAgg["mozilla.org"]; + Assert.equal( + mozillaDomain.num_sessions, + 1, + "mozilla.org appears in one session" + 
); + + const googleDomain = domainAgg["www.google.com"]; + Assert.ok(googleDomain, "www.google.com aggregate present"); + Assert.equal( + googleDomain.num_sessions, + 1, + "www.google.com appears in one session" + ); + + // Titles + Assert.ok( + Object.prototype.hasOwnProperty.call(titleAgg, "Example A1"), + "Title Example A1 aggregated" + ); + Assert.ok( + Object.prototype.hasOwnProperty.call(titleAgg, "Example A2"), + "Title Example A2 aggregated" + ); + + const titleA2 = titleAgg["Example A2"]; + Assert.equal(titleA2.num_sessions, 1, "Example A2 appears in one session"); + + // Searches + Assert.ok( + Object.prototype.hasOwnProperty.call(searchAgg, 1), + "Search aggregate for session 1 present" + ); + Assert.ok( + !Object.prototype.hasOwnProperty.call(searchAgg, 2), + "No search aggregate for session 2" + ); + + const search1 = searchAgg[1]; + Assert.equal(search1.search_count, 1, "search_count aggregated"); + Assert.deepEqual( + search1.search_titles.sort(), + ["Google search: test"].sort(), + "search_titles aggregated and deduplicated" + ); + Assert.greater( + search1.last_searched, + 0, + "last_searched converted to seconds and > 0" + ); +}); + +add_task(function test_aggregateSessions_empty() { + const [domainAgg, titleAgg, searchAgg] = aggregateSessions([]); + + Assert.deepEqual( + Object.keys(domainAgg), + [], + "Empty input -> no domain aggregates" + ); + Assert.deepEqual( + Object.keys(titleAgg), + [], + "Empty input -> no title aggregates" + ); + Assert.deepEqual( + Object.keys(searchAgg), + [], + "Empty input -> no search aggregates" + ); +}); + +add_task(function test_topkAggregates_recency_and_ranking() { + const nowSec = Math.floor(Date.now() / 1000); + + // Two domains: + // - old.com: very old + // - fresh.com: very recent + const aggDomains = { + "old.com": { + score: 100, + last_seen: nowSec - 60 * 60 * 24 * 60, // 60 days ago + num_sessions: 1, + session_importance: 1, + }, + "fresh.com": { + score: 100, + last_seen: nowSec - 60 * 60, // 1 hour 
ago + num_sessions: 1, + session_importance: 1, + }, + }; + + const [domainItems] = topkAggregates( + aggDomains, + {}, + {}, + { + k_domains: 2, + k_titles: 0, + k_searches: 0, + now: nowSec, + } + ); + + // Expect fresh.com to outrank old.com due to recency decay. + const [firstDomain, secondDomain] = domainItems.map(([key]) => key); + Assert.equal( + firstDomain, + "fresh.com", + "More recent domain outranks older one" + ); + Assert.equal(secondDomain, "old.com", "Older domain comes second"); +});