tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit f7275521a8cf4f943757f5c5801a4394518bba39
parent 7224ab439de7d989e29e57e28732894da6aab740
Author: Chidam Gopal <cgopal@mozilla.com>
Date:   Thu, 11 Dec 2025 19:57:33 +0000

Bug 2005524 - Insights drift detector for generation from history r=cdipersio,ai-models-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D275998

Diffstat:
M browser/base/content/test/static/browser_all_files_referenced.js | 4 ++++
A browser/components/aiwindow/models/InsightsDriftDetector.sys.mjs | 431 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/moz.build | 1 +
A browser/components/aiwindow/models/tests/xpcshell/test_InsightsDriftDetector.js | 285 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml | 2 ++
5 files changed, 723 insertions(+), 0 deletions(-)

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

// File: browser/components/aiwindow/models/InsightsDriftDetector.sys.mjs
//
// Companion changes in the same commit (Bug 2005524):
//  - browser/base/content/test/static/browser_all_files_referenced.js adds
//    this module to `allowlist`:
//      { file: "moz-src:///browser/components/aiwindow/models/InsightsDriftDetector.sys.mjs" }
//  - browser/components/aiwindow/models/moz.build adds
//    "InsightsDriftDetector.sys.mjs" to MOZ_SRC_FILES.

import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
import { InsightsManager } from "moz-src:///browser/components/aiwindow/models/InsightsManager.sys.mjs";
import { sessionizeVisits } from "moz-src:///browser/components/aiwindow/models/InsightsHistorySource.sys.mjs";

/**
 * @typedef {object} SessionMetric
 * @property {string|number} sessionId Unique identifier for the session
 * @property {number} jsScore Jensen–Shannon divergence for the session
 * @property {number} avgSurprisal Average surprisal for the session
 * @property {number} [timestampMs] Session start time in ms; used to order
 *   sessions from oldest to newest
 */

/**
 * This module detects drift to help decide when to run insights generation.
 *
 * High-level flow for history-based drift:
 *  1. Read last_history_insight_ts via InsightsManager.getLastHistoryInsightTimestamp().
 *  2. Use a DRIFT_LOOKBACK_DAYS (e.g. 14 days) lookback prior to that timestamp
 *     to define a baseline window, and include visits from that lookback to
 *     "now" (capped at the DRIFT_HISTORY_LIMIT most recent visits).
 *  3. Sessionize visits via sessionizeVisits().
 *  4. Split sessions into:
 *       baseline: session_start_ms <  last_history_insight_ts
 *       delta:    session_start_ms >= last_history_insight_ts
 *  5. Build a baseline host distribution from baseline sessions.
 *  6. For BOTH baseline and delta sessions, compute:
 *       - JS divergence vs baseline.
 *       - Average surprisal vs baseline.
 *  7. Use baseline metrics to derive thresholds (e.g. 0.9 quantile),
 *     and compare recent delta sessions to those thresholds to decide a trigger.
 */

// Quantile of baseline scores used as a threshold (e.g. 0.9 => 90th percentile).
const DEFAULT_TRIGGER_QUANTILE = 0.9;
// Lookback period before lastHistoryInsightTS to define the baseline window.
const DRIFT_LOOKBACK_DAYS = 14;
// Cap on how many visits to fetch from Places.
const DRIFT_HISTORY_LIMIT = 5000;

// How many of the most recent delta sessions to evaluate against thresholds.
const DEFAULT_EVAL_DELTA_COUNT = 3;

const MS_PER_DAY = 24 * 60 * 60 * 1000;
const MICROS_PER_MS = 1000;
// Floor used in place of a zero / missing probability so logs stay finite.
const EPS = 1e-12;

// Most recent visits first, so that the LIMIT drops the oldest visits.
const DRIFT_HISTORY_SQL = `
  SELECT
    p.id AS place_id,
    p.url AS url,
    o.host AS host,
    p.title AS title,
    v.visit_date AS visit_date
  FROM moz_places p
  JOIN moz_historyvisits v ON v.place_id = p.id
  JOIN moz_origins o ON p.origin_id = o.id
  WHERE v.visit_date >= :cutoff
    AND p.title IS NOT NULL
    AND p.frecency IS NOT NULL
    AND o.host IS NOT NULL
    AND length(o.host) > 0
  ORDER BY v.visit_date DESC
  LIMIT :limit
`;

/**
 * Compute the q-quantile of an array of numbers using linear interpolation
 * between the two nearest order statistics.
 *
 * @param {number[]} values Input numbers; the array is not modified.
 * @param {number} quantile in [0, 1], e.g. 0.9. Values outside the range are
 *   clamped to it.
 * @returns {number} The interpolated quantile, or 0 for an empty array.
 */
function computeQuantile(values, quantile) {
  if (!values.length) {
    return 0;
  }
  // Clamp so an out-of-range quantile cannot index past either end of the
  // sorted array (which would produce a NaN threshold that never triggers).
  const q = Math.min(1, Math.max(0, quantile));
  const sorted = [...values].sort((a, b) => a - b);
  const pos = (sorted.length - 1) * q;
  const lowerIdx = Math.floor(pos);
  const upperIdx = Math.ceil(pos);

  if (lowerIdx === upperIdx) {
    return sorted[lowerIdx];
  }
  const lower = sorted[lowerIdx];
  const upper = sorted[upperIdx];
  const weight = pos - lowerIdx;
  return lower + weight * (upper - lower);
}

/**
 * Compute KL divergence KL(P || Q) using natural logarithms.
 *
 * Keys with P(key) <= 0 contribute nothing. A missing or non-positive
 * Q(key) is floored at EPS so the ratio stays finite.
 *
 * @param {Map<string, number>} p
 * @param {Map<string, number>} q
 * @returns {number}
 */
function klDiv(p, q) {
  let sum = 0;
  for (const [key, pVal] of p.entries()) {
    if (pVal <= 0) {
      continue;
    }
    // `?? EPS` alone would let an explicit 0 through and divide by zero.
    const qVal = Math.max(q.get(key) ?? 0, EPS);
    sum += pVal * Math.log(pVal / qVal);
  }
  return sum;
}

/**
 * Build a normalized probability distribution (Map) from host to count.
 *
 * @param {Map<string, number>} counts
 * @returns {Map<string, number>} Host to probability, summing to 1; an empty
 *   Map when `counts` is empty or its total is not positive.
 */
function normalizeCounts(counts) {
  let total = 0;
  for (const count of counts.values()) {
    total += count;
  }
  // Without positive mass there is no distribution to speak of. Returning an
  // empty Map lets callers detect this through `.size`.
  if (!(total > 0)) {
    return new Map();
  }
  const dist = new Map();
  for (const [host, count] of counts.entries()) {
    dist.set(host, count / total);
  }
  return dist;
}

/**
 * Compute Jensen–Shannon divergence between two distributions P and Q.
 *
 * P and Q are Maps of host to probability. The result is built from klDiv(),
 * which uses natural logarithms, so for proper distributions it lies in
 * [0, ln 2].
 *
 * @param {Map<string, number>} p
 * @param {Map<string, number>} q
 * @returns {number} The divergence, or 0 when either distribution is empty.
 */
function jsDivergence(p, q) {
  if (!p.size || !q.size) {
    return 0;
  }
  // M is the midpoint distribution 0.5 * (P + Q) over the union of hosts.
  const m = new Map();
  const allKeys = new Set([...p.keys(), ...q.keys()]);
  for (const key of allKeys) {
    const pv = p.get(key) ?? 0;
    const qv = q.get(key) ?? 0;
    m.set(key, 0.5 * (pv + qv));
  }
  const klPM = klDiv(p, m);
  const klQM = klDiv(q, m);
  // Guard against a tiny negative result caused by floating-point rounding.
  return Math.max(0, 0.5 * (klPM + klQM));
}

/**
 * Compute average surprisal of a session under a baseline distribution.
 *
 * For each visit host in the session, surprisal = -log2 P_baseline(host).
 * If a host is unseen (or has a non-positive probability), EPS is used
 * instead, which contributes roughly 39.9 bits.
 *
 * @param {string[]} hosts
 * @param {Map<string, number>} baselineDist
 * @returns {number} Mean surprisal in bits, or 0 when either input is empty.
 */
function averageSurprisal(hosts, baselineDist) {
  if (!hosts.length || !baselineDist.size) {
    return 0;
  }
  let sum = 0;
  for (const host of hosts) {
    const prob = Math.max(baselineDist.get(host) ?? 0, EPS);
    sum += -Math.log2(prob);
  }
  return sum / hosts.length;
}

/**
 * Static helpers that compute per-session drift metrics from browsing
 * history and turn them into a trigger decision.
 */
export class InsightsDriftDetector {
  /**
   * Convenience helper: compute metrics AND a trigger decision in one call.
   *
   * @param {object} [options] Forwarded to computeDriftTriggerFromBaseline().
   * @param {number} [options.triggerQuantile]
   * @param {number} [options.evalDeltaCount]
   * @returns {Promise<{
   *   baselineMetrics: SessionMetric[],
   *   deltaMetrics: SessionMetric[],
   *   trigger: {
   *     jsThreshold: number,
   *     surpriseThreshold: number,
   *     triggered: boolean,
   *     triggeredSessionIds: Array<string|number>,
   *   },
   * }>}
   */
  static async computeHistoryDriftAndTrigger(options = {}) {
    const { baselineMetrics, deltaMetrics } =
      await this.computeHistoryDriftSessionMetrics();

    const trigger = this.computeDriftTriggerFromBaseline(
      baselineMetrics,
      deltaMetrics,
      options
    );

    return { baselineMetrics, deltaMetrics, trigger };
  }

  /**
   * Build SessionMetric[] for a group of sessions, given a baseline distribution.
   *
   * @param {Array<{ sessionId: string|number, hosts: string[], startMs: number }>} sessions
   * @param {Map<string, number>} baselineDist
   * @returns {SessionMetric[]} One metric per session, sorted by ascending
   *   timestampMs (oldest first).
   */
  static _buildSessionMetricsForGroup(sessions, baselineDist) {
    const metrics = [];

    for (const sess of sessions) {
      const sessionHostCounts = new Map();
      for (const h of sess.hosts) {
        sessionHostCounts.set(h, (sessionHostCounts.get(h) ?? 0) + 1);
      }
      const sessionDist = normalizeCounts(sessionHostCounts);
      const jsScore = jsDivergence(sessionDist, baselineDist);
      const avgSurp = averageSurprisal(sess.hosts, baselineDist);

      metrics.push({
        sessionId: sess.sessionId,
        jsScore,
        avgSurprisal: avgSurp,
        timestampMs: sess.startMs,
      });
    }

    metrics.sort((a, b) => (a.timestampMs ?? 0) - (b.timestampMs ?? 0));
    return metrics;
  }

  /**
   * Trigger computation based on a baseline window and recent delta sessions.
   *
   * Thresholds are the `triggerQuantile` quantile of the baseline JS scores
   * and of the baseline average surprisals. A delta session is flagged when
   * either of its scores is strictly above the matching threshold.
   *
   * Only the LAST `evalDeltaCount` entries of `deltaMetrics` are evaluated, so
   * `deltaMetrics` is expected to be ordered oldest to newest (as returned by
   * _buildSessionMetricsForGroup()).
   *
   * @param {SessionMetric[]} baselineMetrics
   * @param {SessionMetric[]} deltaMetrics
   * @param {object} [options]
   * @param {number} [options.triggerQuantile=0.9]
   *   Quantile in [0, 1]; defaults to DEFAULT_TRIGGER_QUANTILE.
   * @param {number} [options.evalDeltaCount=3]
   *   Number of most recent delta sessions to evaluate; defaults to
   *   DEFAULT_EVAL_DELTA_COUNT. Zero or a negative number evaluates none.
   * @returns {{
   *   jsThreshold: number,
   *   surpriseThreshold: number,
   *   triggered: boolean,
   *   triggeredSessionIds: Array<string|number>,
   * }}
   */
  static computeDriftTriggerFromBaseline(
    baselineMetrics,
    deltaMetrics,
    {
      triggerQuantile = DEFAULT_TRIGGER_QUANTILE,
      evalDeltaCount = DEFAULT_EVAL_DELTA_COUNT,
    } = {}
  ) {
    if (
      !Array.isArray(baselineMetrics) ||
      !baselineMetrics.length ||
      !Array.isArray(deltaMetrics) ||
      !deltaMetrics.length
    ) {
      return {
        jsThreshold: 0,
        surpriseThreshold: 0,
        triggered: false,
        triggeredSessionIds: [],
      };
    }

    const jsBase = baselineMetrics.map(m => m.jsScore ?? 0);
    const surpBase = baselineMetrics.map(m => m.avgSurprisal ?? 0);

    const jsThreshold = computeQuantile(jsBase, triggerQuantile);
    const surpriseThreshold = computeQuantile(surpBase, triggerQuantile);

    // Normalize the count before slicing: slice(-0) is slice(0) and would
    // return EVERY session, and a negative count would drop the oldest
    // sessions instead of limiting to the newest ones.
    let evalCount = Math.floor(evalDeltaCount);
    if (Number.isNaN(evalCount)) {
      evalCount = DEFAULT_EVAL_DELTA_COUNT;
    }
    const evalMetrics = evalCount > 0 ? deltaMetrics.slice(-evalCount) : [];

    const triggeredSessionIds = [];
    for (const m of evalMetrics) {
      const jsTriggered = (m.jsScore ?? 0) > jsThreshold;
      const surpTriggered = (m.avgSurprisal ?? 0) > surpriseThreshold;
      if (jsTriggered || surpTriggered) {
        triggeredSessionIds.push(m.sessionId);
      }
    }

    return {
      jsThreshold,
      surpriseThreshold,
      triggered: !!triggeredSessionIds.length,
      triggeredSessionIds,
    };
  }

  /**
   * Compute per-session drift metrics (JS divergence and average surprisal)
   * for baseline and delta sessions, based on history around the last
   * history insight timestamp.
   *
   * Baseline window:
   *   [last_history_insight_ts - DRIFT_LOOKBACK_DAYS, last_history_insight_ts)
   * Delta window:
   *   [last_history_insight_ts, now)
   *
   * If there is no prior history insight timestamp, or if there is not enough
   * data to form both baseline and delta, this returns empty arrays.
   *
   * @returns {Promise<{ baselineMetrics: SessionMetric[], deltaMetrics: SessionMetric[] }>}
   */
  static async computeHistoryDriftSessionMetrics() {
    const lastTsMs = await InsightsManager.getLastHistoryInsightTimestamp();
    if (!lastTsMs) {
      // No prior insights -> no meaningful baseline yet.
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    const lookbackStartMs = lastTsMs - DRIFT_LOOKBACK_DAYS * MS_PER_DAY;
    // visit_date is compared against a microsecond cutoff.
    const cutoffMicros = Math.max(0, lookbackStartMs) * MICROS_PER_MS;

    /** @type {Array<{ placeId:number, url:string, host:string, title:string, visitDateMicros:number }>} */
    const rows = [];
    await PlacesUtils.withConnectionWrapper(
      "InsightsDriftDetector:computeHistoryDriftSessionMetrics",
      async db => {
        const resultRows = await db.executeCached(DRIFT_HISTORY_SQL, {
          cutoff: cutoffMicros,
          limit: DRIFT_HISTORY_LIMIT,
        });
        for (const row of resultRows) {
          rows.push({
            placeId: row.getResultByName("place_id"),
            url: row.getResultByName("url"),
            host: row.getResultByName("host"),
            title: row.getResultByName("title"),
            visitDateMicros: row.getResultByName("visit_date"),
          });
        }
      }
    );

    if (!rows.length) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    // You can tune gapSec if you want shorter / longer sessions using opts = { gapSec: 900 }
    const sessionized = sessionizeVisits(rows);

    // Build sessions keyed by session_id.
    /** @type {Map<number, { sessionId: number, hosts: string[], isBaseline: boolean, startMs: number }>} */
    const sessions = new Map();

    for (const row of sessionized) {
      const sessionId = row.session_id;
      const startMs = row.session_start_ms;
      const host = row.host;

      if (!host) {
        continue;
      }

      let sess = sessions.get(sessionId);
      if (!sess) {
        sess = {
          sessionId,
          hosts: [],
          // A session belongs to the baseline when it started before the
          // last history insight was generated.
          isBaseline: startMs < lastTsMs,
          startMs,
        };
        sessions.set(sessionId, sess);
      }
      sess.hosts.push(host);
    }

    const baselineSessions = [];
    const deltaSessions = [];

    for (const sess of sessions.values()) {
      if (sess.isBaseline) {
        baselineSessions.push(sess);
      } else {
        deltaSessions.push(sess);
      }
    }

    if (!baselineSessions.length || !deltaSessions.length) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    // Build baseline host counts.
    const baselineCounts = new Map();
    for (const sess of baselineSessions) {
      for (const h of sess.hosts) {
        baselineCounts.set(h, (baselineCounts.get(h) ?? 0) + 1);
      }
    }

    const baselineDist = normalizeCounts(baselineCounts);
    if (!baselineDist.size) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    const baselineMetrics = this._buildSessionMetricsForGroup(
      baselineSessions,
      baselineDist
    );
    const deltaMetrics = this._buildSessionMetricsForGroup(
      deltaSessions,
      baselineDist
    );

    return { baselineMetrics, deltaMetrics };
  }
}
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

// File: browser/components/aiwindow/models/tests/xpcshell/test_InsightsDriftDetector.js
//
// The same commit registers this test in
// browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml as
// ["test_InsightsDriftDetector.js"].

"use strict";

const { InsightsDriftDetector } = ChromeUtils.importESModule(
  "moz-src:///browser/components/aiwindow/models/InsightsDriftDetector.sys.mjs"
);
const { InsightsManager } = ChromeUtils.importESModule(
  "moz-src:///browser/components/aiwindow/models/InsightsManager.sys.mjs"
);

// With neither baseline nor delta metrics, the detector must return the
// zeroed, non-triggered result.
add_task(function test_computeDriftTriggerFromBaseline_no_data() {
  const result = InsightsDriftDetector.computeDriftTriggerFromBaseline(
    [],
    [],
    {}
  );

  Assert.ok(!result.triggered, "No data should not trigger");
  Assert.equal(result.jsThreshold, 0, "JS threshold should be 0 with no data");
  Assert.equal(
    result.surpriseThreshold,
    0,
    "Surprise threshold should be 0 with no data"
  );
  Assert.deepEqual(
    result.triggeredSessionIds,
    [],
    "No triggered sessions without data"
  );
});

// Delta sessions well above every baseline score must all be flagged.
add_task(function test_computeDriftTriggerFromBaseline_triggers_on_delta() {
  /** @type {SessionMetric[]} */
  const baselineMetrics = [
    {
      sessionId: "b1",
      jsScore: 0.05,
      avgSurprisal: 2,
      timestampMs: 1,
    },
    {
      sessionId: "b2",
      jsScore: 0.08,
      avgSurprisal: 2.5,
      timestampMs: 2,
    },
    {
      sessionId: "b3",
      jsScore: 0.1,
      avgSurprisal: 3,
      timestampMs: 3,
    },
  ];

  // Make delta fairly "spiky" so it's above baseline thresholds.
  const deltaMetrics = [
    {
      sessionId: "d1",
      jsScore: 0.5,
      avgSurprisal: 6,
      timestampMs: 4,
    },
    {
      sessionId: "d2",
      jsScore: 0.6,
      avgSurprisal: 7,
      timestampMs: 5,
    },
  ];

  const result = InsightsDriftDetector.computeDriftTriggerFromBaseline(
    baselineMetrics,
    deltaMetrics,
    {
      triggerQuantile: 0.8,
      evalDeltaCount: 2,
    }
  );

  Assert.greater(result.jsThreshold, 0, "JS baseline threshold should be > 0");
  Assert.greater(
    result.surpriseThreshold,
    0,
    "Surprise baseline threshold should be > 0"
  );
  Assert.ok(result.triggered, "High delta metrics should trigger drift");
  // Sort a copy so the assertion does not mutate the result under test.
  Assert.deepEqual(
    [...result.triggeredSessionIds].sort(),
    ["d1", "d2"],
    "Both delta sessions should be flagged as triggered"
  );
});

// A baseline without any delta sessions takes the early-return path.
add_task(function test_computeDriftTriggerFromBaseline_no_delta() {
  const baselineMetrics = [
    {
      sessionId: "b1",
      jsScore: 0.1,
      avgSurprisal: 3,
      timestampMs: 1,
    },
  ];

  const deltaMetrics = [];

  const result = InsightsDriftDetector.computeDriftTriggerFromBaseline(
    baselineMetrics,
    deltaMetrics,
    {}
  );

  Assert.ok(!result.triggered, "No delta metrics should not trigger");
  Assert.equal(result.jsThreshold, 0, "JS threshold should be 0 with no data");
  Assert.equal(
    result.surpriseThreshold,
    0,
    "Surprise threshold should be 0 with no data"
  );
  Assert.deepEqual(
    result.triggeredSessionIds,
    [],
    "No triggered sessions without delta metrics"
  );
});

// Only the last `evalDeltaCount` delta sessions may influence the decision.
add_task(
  function test_computeDriftTriggerFromBaseline_respects_evalDeltaCount() {
    const baselineMetrics = [
      {
        sessionId: "b1",
        jsScore: 0.05,
        avgSurprisal: 2,
        timestampMs: 1,
      },
      {
        sessionId: "b2",
        jsScore: 0.08,
        avgSurprisal: 2.5,
        timestampMs: 2,
      },
    ];

    // First delta is "spiky", later ones are normal-ish.
    const deltaMetrics = [
      {
        sessionId: "d1",
        jsScore: 0.7,
        avgSurprisal: 8,
        timestampMs: 3,
      },
      {
        sessionId: "d2",
        jsScore: 0.06,
        avgSurprisal: 2.1,
        timestampMs: 4,
      },
      {
        sessionId: "d3",
        jsScore: 0.07,
        avgSurprisal: 2.2,
        timestampMs: 5,
      },
    ];

    // With evalDeltaCount = 2 only d2 and d3 should be evaluated.
    const result = InsightsDriftDetector.computeDriftTriggerFromBaseline(
      baselineMetrics,
      deltaMetrics,
      {
        triggerQuantile: 0.8,
        evalDeltaCount: 2,
      }
    );

    Assert.ok(
      !result.triggered,
      "When only the last 2 non-spiky deltas are evaluated, drift should not trigger"
    );
    Assert.deepEqual(
      result.triggeredSessionIds,
      [],
      "No delta sessions should be flagged when only low scores are considered"
    );
  }
);

// Delta sessions that sit inside the baseline range must not trigger.
add_task(function test_computeDriftTriggerFromBaseline_non_spiky_no_trigger() {
  const baselineMetrics = [
    {
      sessionId: "b1",
      jsScore: 0.1,
      avgSurprisal: 3,
      timestampMs: 1,
    },
    {
      sessionId: "b2",
      jsScore: 0.12,
      avgSurprisal: 3.2,
      timestampMs: 2,
    },
    {
      sessionId: "b3",
      jsScore: 0.11,
      avgSurprisal: 3.1,
      timestampMs: 3,
    },
  ];

  const deltaMetrics = [
    {
      sessionId: "d1",
      jsScore: 0.105,
      avgSurprisal: 3.05,
      timestampMs: 4,
    },
    {
      sessionId: "d2",
      jsScore: 0.115,
      avgSurprisal: 3.1,
      timestampMs: 5,
    },
  ];

  const result = InsightsDriftDetector.computeDriftTriggerFromBaseline(
    baselineMetrics,
    deltaMetrics,
    {
      triggerQuantile: 0.9,
      evalDeltaCount: 2,
    }
  );

  Assert.ok(
    !result.triggered,
    "Delta sessions similar to baseline should not trigger drift"
  );
  Assert.deepEqual(
    result.triggeredSessionIds,
    [],
    "No sessions should be flagged when deltas match baseline distribution"
  );
});

// Without a previous history insight timestamp the whole pipeline bails out
// with empty metrics and no trigger.
add_task(async function test_computeHistoryDriftAndTrigger_no_prior_insight() {
  const originalGetLastHistoryInsightTimestamp =
    InsightsManager.getLastHistoryInsightTimestamp;

  // Force "no previous insight" so computeHistoryDriftSessionMetrics bails out.
  InsightsManager.getLastHistoryInsightTimestamp = async () => null;

  try {
    const result = await InsightsDriftDetector.computeHistoryDriftAndTrigger(
      {}
    );

    Assert.ok(
      Array.isArray(result.baselineMetrics),
      "baselineMetrics should be an array"
    );
    Assert.ok(
      Array.isArray(result.deltaMetrics),
      "deltaMetrics should be an array"
    );
    Assert.equal(
      result.baselineMetrics.length,
      0,
      "No baseline metrics when there is no prior insight timestamp"
    );
    Assert.equal(
      result.deltaMetrics.length,
      0,
      "No delta metrics when there is no prior insight timestamp"
    );
    Assert.ok(
      !result.trigger.triggered,
      "Trigger should be false when there is no prior insight"
    );
  } finally {
    // Restore the original implementation even when an assertion throws, so
    // the stub cannot leak into later tasks.
    InsightsManager.getLastHistoryInsightTimestamp =
      originalGetLastHistoryInsightTimestamp;
  }
});