MemoriesDriftDetector.sys.mjs (13079B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
import { MemoriesManager } from "moz-src:///browser/components/aiwindow/models/memories/MemoriesManager.sys.mjs";
import { sessionizeVisits } from "moz-src:///browser/components/aiwindow/models/memories/MemoriesHistorySource.sys.mjs";

import {
  // How many of the most recent delta sessions to evaluate against thresholds.
  DRIFT_EVAL_DELTA_COUNT as DEFAULT_EVAL_DELTA_COUNT,
  // Quantile of baseline scores used as a threshold (e.g. 0.9 => 90th percentile).
  DRIFT_TRIGGER_QUANTILE as DEFAULT_TRIGGER_QUANTILE,
} from "moz-src:///browser/components/aiwindow/models/memories/MemoriesConstants.sys.mjs";

/**
 * @typedef {object} SessionMetric
 * @property {string|number} sessionId Unique identifier for the session
 * @property {number} jsScore Jensen–Shannon divergence for the session
 * @property {number} avgSurprisal Average surprisal for the session
 * @property {number} [timestampMs] Optional timestamp for debugging
 */

/**
 * This class detects drift to help decide when to run memories generation.
 *
 * High-level flow for history-based drift:
 * 1. Read last_history_memory_ts via MemoriesManager.getLastHistoryMemoryTimestamp().
 * 2. Use a DRIFT_LOOKBACK_DAYS (e.g. 14 days) lookback prior to that timestamp
 *    to define a baseline window, and include all visits from that lookback to "now".
 * 3. Sessionize visits via sessionizeVisits().
 * 4. Split sessions into:
 *      baseline: session_start_ms <  last_history_memory_ts
 *      delta:    session_start_ms >= last_history_memory_ts
 * 5. Build a baseline host distribution from baseline sessions.
 * 6. For BOTH baseline and delta sessions, compute:
 *      - JS divergence vs baseline.
 *      - Average surprisal vs baseline.
 * 7. Use baseline metrics to derive thresholds (e.g. 0.9 quantile),
 *    and compare recent delta sessions to those thresholds to decide a trigger.
 */

// Lookback period before lastHistoryMemoryTS to define the baseline window.
const DRIFT_LOOKBACK_DAYS = 14;
// Cap on how many visits to fetch from Places.
const DRIFT_HISTORY_LIMIT = 5000;

const MS_PER_DAY = 24 * 60 * 60 * 1000;
const MICROS_PER_MS = 1000;
// Floor probability substituted for hosts with zero baseline mass, so the
// log terms in KL divergence / surprisal stay finite.
const EPS = 1e-12;

// Recent titled visits joined to their origin host; newest first, capped at
// :limit rows. Places stores visit_date in microseconds, hence :cutoff is
// expected in microseconds too.
const DRIFT_HISTORY_SQL = `
  SELECT
    p.id AS place_id,
    p.url AS url,
    o.host AS host,
    p.title AS title,
    v.visit_date AS visit_date
  FROM moz_places p
  JOIN moz_historyvisits v ON v.place_id = p.id
  JOIN moz_origins o ON p.origin_id = o.id
  WHERE v.visit_date >= :cutoff
    AND p.title IS NOT NULL
    AND p.frecency IS NOT NULL
    AND o.host IS NOT NULL
    AND length(o.host) > 0
  ORDER BY v.visit_date DESC
  LIMIT :limit
`;

/**
 * Compute the q-quantile of an array of numbers using linear interpolation
 * between the two nearest order statistics.
 *
 * @param {number[]} values
 * @param {number} quantile in [0, 1], e.g. 0.9
 * @returns {number} The interpolated quantile, or 0 for an empty array.
 */
function computeQuantile(values, quantile) {
  if (!values.length) {
    return 0;
  }
  const sorted = [...values].sort((a, b) => a - b);
  const pos = (sorted.length - 1) * quantile;
  const lowerIdx = Math.floor(pos);
  const upperIdx = Math.ceil(pos);

  if (lowerIdx === upperIdx) {
    return sorted[lowerIdx];
  }
  const lower = sorted[lowerIdx];
  const upper = sorted[upperIdx];
  const weight = pos - lowerIdx;
  return lower + weight * (upper - lower);
}

/**
 * Compute KL divergence KL(P || Q), in nats (natural log).
 *
 * Keys missing from Q are treated as having probability EPS so the ratio
 * stays finite; zero-probability entries of P contribute nothing.
 *
 * @param {Map<string, number>} p
 * @param {Map<string, number>} q
 * @returns {number}
 */
function klDiv(p, q) {
  let sum = 0;
  for (const [key, pVal] of p.entries()) {
    if (pVal <= 0) {
      continue;
    }
    const qVal = q.get(key) ?? EPS;
    const ratio = pVal / qVal;
    sum += pVal * Math.log(ratio);
  }
  return sum;
}

/**
 * Build a normalized probability distribution (Map) from host to count.
 *
 * @param {Map<string, number>} counts
 * @returns {Map<string, number>} Empty map for empty input.
 */
function normalizeCounts(counts) {
  if (!counts.size) {
    return new Map();
  }
  let total = 0;
  for (const v of counts.values()) {
    total += v;
  }
  const dist = new Map();
  for (const [k, v] of counts.entries()) {
    // Math.max guards against division by zero; with integer visit counts
    // total is always >= 1 here, so this is purely defensive.
    dist.set(k, v / Math.max(1, total));
  }
  return dist;
}

/**
 * Compute Jensen–Shannon divergence between two distributions P and Q:
 * JSD = 0.5 * KL(P || M) + 0.5 * KL(Q || M), with M = (P + Q) / 2.
 *
 * P and Q are Maps of host to probability.
 *
 * @param {Map<string, number>} p
 * @param {Map<string, number>} q
 * @returns {number} 0 when either distribution is empty.
 */
function jsDivergence(p, q) {
  if (!p.size || !q.size) {
    return 0;
  }
  const m = new Map();
  const allKeys = new Set([...p.keys(), ...q.keys()]);
  for (const key of allKeys) {
    const pv = p.get(key) ?? 0;
    const qv = q.get(key) ?? 0;
    m.set(key, 0.5 * (pv + qv));
  }
  const klPM = klDiv(p, m);
  const klQM = klDiv(q, m);
  return 0.5 * (klPM + klQM);
}

/**
 * Compute average surprisal of a session under a baseline distribution.
 *
 * For each visit host in the session, surprisal = -log2 P_baseline(host).
 * If a host is unseen, a small epsilon is used.
 *
 * @param {string[]} hosts
 * @param {Map<string, number>} baselineDist
 * @returns {number} 0 when hosts or the baseline distribution is empty.
 */
function averageSurprisal(hosts, baselineDist) {
  if (!hosts.length || !baselineDist.size) {
    return 0;
  }
  let sum = 0;
  for (const host of hosts) {
    const prob = baselineDist.get(host) ?? EPS;
    sum += -Math.log2(prob);
  }
  return sum / hosts.length;
}

/**
 * Detects distributional drift in recent browsing history relative to the
 * history seen before the last memories-generation run, to decide whether
 * memories generation should be triggered again. All members are static;
 * this class is never instantiated.
 */
export class MemoriesDriftDetector {
  /**
   * Convenience helper: compute metrics AND a trigger decision in one call.
   *
   * @param {object} [options]
   * @param {number} [options.triggerQuantile]
   * @param {number} [options.evalDeltaCount]
   * @returns {Promise<{
   *   baselineMetrics: SessionMetric[],
   *   deltaMetrics: SessionMetric[],
   *   trigger: {
   *     jsThreshold: number,
   *     surpriseThreshold: number,
   *     triggered: boolean,
   *     triggeredSessionIds: Array<string|number>,
   *   },
   * }>}
   */
  static async computeHistoryDriftAndTrigger(options = {}) {
    const { baselineMetrics, deltaMetrics } =
      await this.computeHistoryDriftSessionMetrics();

    const trigger = this.computeDriftTriggerFromBaseline(
      baselineMetrics,
      deltaMetrics,
      options
    );

    return { baselineMetrics, deltaMetrics, trigger };
  }

  /**
   * Build SessionMetric[] for a group of sessions, given a baseline distribution.
   *
   * @param {Array<{ sessionId: string|number, hosts: string[], startMs: number }>} sessions
   * @param {Map<string, number>} baselineDist
   * @returns {SessionMetric[]} Sorted ascending by timestampMs.
   */
  static _buildSessionMetricsForGroup(sessions, baselineDist) {
    const metrics = [];

    for (const sess of sessions) {
      const sessionHostCounts = new Map();
      for (const h of sess.hosts) {
        sessionHostCounts.set(h, (sessionHostCounts.get(h) ?? 0) + 1);
      }
      const sessionDist = normalizeCounts(sessionHostCounts);
      const jsScore = jsDivergence(sessionDist, baselineDist);
      const avgSurp = averageSurprisal(sess.hosts, baselineDist);

      metrics.push({
        sessionId: sess.sessionId,
        jsScore,
        avgSurprisal: avgSurp,
        timestampMs: sess.startMs,
      });
    }

    metrics.sort((a, b) => (a.timestampMs ?? 0) - (b.timestampMs ?? 0));
    return metrics;
  }

  /**
   * Trigger computation based on a baseline window and recent delta sessions.
   *
   * Thresholds are the triggerQuantile of the baseline sessions' scores; a
   * delta session trips the trigger when EITHER its JS divergence or its
   * average surprisal exceeds the corresponding threshold.
   *
   * @param {SessionMetric[]} baselineMetrics
   * @param {SessionMetric[]} deltaMetrics
   * @param {object} [options]
   * @param {number} [options.triggerQuantile=DEFAULT_TRIGGER_QUANTILE]
   *   Defaults to DRIFT_TRIGGER_QUANTILE from MemoriesConstants.
   * @param {number} [options.evalDeltaCount=DEFAULT_EVAL_DELTA_COUNT]
   *   Defaults to DRIFT_EVAL_DELTA_COUNT from MemoriesConstants.
   * @returns {{
   *   jsThreshold: number,
   *   surpriseThreshold: number,
   *   triggered: boolean,
   *   triggeredSessionIds: Array<string|number>,
   * }}
   */
  static computeDriftTriggerFromBaseline(
    baselineMetrics,
    deltaMetrics,
    {
      triggerQuantile = DEFAULT_TRIGGER_QUANTILE,
      evalDeltaCount = DEFAULT_EVAL_DELTA_COUNT,
    } = {}
  ) {
    if (
      !Array.isArray(baselineMetrics) ||
      !baselineMetrics.length ||
      !Array.isArray(deltaMetrics) ||
      !deltaMetrics.length
    ) {
      return {
        jsThreshold: 0,
        surpriseThreshold: 0,
        triggered: false,
        triggeredSessionIds: [],
      };
    }

    const jsBase = baselineMetrics.map(m => m.jsScore ?? 0);
    const surpBase = baselineMetrics.map(m => m.avgSurprisal ?? 0);

    const jsThreshold = computeQuantile(jsBase, triggerQuantile);
    const surpriseThreshold = computeQuantile(surpBase, triggerQuantile);

    // Only the most recent evalDeltaCount delta sessions are evaluated
    // (metrics arrive sorted ascending by timestamp, so slice from the end).
    const evalMetrics =
      deltaMetrics.length > evalDeltaCount
        ? deltaMetrics.slice(-evalDeltaCount)
        : deltaMetrics;

    const triggeredSessionIds = [];
    for (const m of evalMetrics) {
      const jsTriggered = (m.jsScore ?? 0) > jsThreshold;
      const surpTriggered = (m.avgSurprisal ?? 0) > surpriseThreshold;
      if (jsTriggered || surpTriggered) {
        triggeredSessionIds.push(m.sessionId);
      }
    }

    return {
      jsThreshold,
      surpriseThreshold,
      triggered: !!triggeredSessionIds.length,
      triggeredSessionIds,
    };
  }

  /**
   * Compute per-session drift metrics (JS divergence and average surprisal)
   * for baseline and delta sessions, based on history around the last
   * history memory timestamp.
   *
   * Baseline window:
   *   [last_history_memory_ts - DRIFT_LOOKBACK_DAYS, last_history_memory_ts)
   * Delta window:
   *   [last_history_memory_ts, now)
   *
   * If there is no prior history memory timestamp, or if there is not enough
   * data to form both baseline and delta, this returns empty arrays.
   *
   * @returns {Promise<{ baselineMetrics: SessionMetric[], deltaMetrics: SessionMetric[] }>}
   */
  static async computeHistoryDriftSessionMetrics() {
    const lastTsMs = await MemoriesManager.getLastHistoryMemoryTimestamp();
    if (!lastTsMs) {
      // No prior memories -> no meaningful baseline yet.
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    const lookbackStartMs = lastTsMs - DRIFT_LOOKBACK_DAYS * MS_PER_DAY;
    // Places stores visit_date in microseconds; clamp to 0 in case the
    // lookback reaches before the epoch.
    const cutoffMicros = Math.max(0, lookbackStartMs) * MICROS_PER_MS;

    /** @type {Array<{ placeId:number, url:string, host:string, title:string, visitDateMicros:number }>} */
    const rows = [];
    await PlacesUtils.withConnectionWrapper(
      "MemoriesDriftDetector:computeHistoryDriftSessionMetrics",
      async db => {
        const resultRows = await db.executeCached(DRIFT_HISTORY_SQL, {
          cutoff: cutoffMicros,
          limit: DRIFT_HISTORY_LIMIT,
        });
        for (const row of resultRows) {
          rows.push({
            placeId: row.getResultByName("place_id"),
            url: row.getResultByName("url"),
            host: row.getResultByName("host"),
            title: row.getResultByName("title"),
            visitDateMicros: row.getResultByName("visit_date"),
          });
        }
      }
    );

    if (!rows.length) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    // NOTE(review): sessionizeVisits presumably accepts a gap option (e.g.
    // { gapSec: 900 }) to tune session boundaries; we rely on its default
    // here — confirm against MemoriesHistorySource before tuning.
    const sessionized = sessionizeVisits(rows);

    // Build sessions keyed by session_id.
    /** @type {Map<number, { sessionId: number, hosts: string[], isBaseline: boolean, startMs: number }>} */
    const sessions = new Map();

    for (const row of sessionized) {
      const sessionId = row.session_id;
      const startMs = row.session_start_ms;
      const host = row.host;

      if (!host) {
        continue;
      }

      let sess = sessions.get(sessionId);
      if (!sess) {
        sess = {
          sessionId,
          hosts: [],
          // Sessions that began before the last memories run form the
          // baseline; everything since forms the delta.
          isBaseline: startMs < lastTsMs,
          startMs,
        };
        sessions.set(sessionId, sess);
      }
      sess.hosts.push(host);
    }

    const baselineSessions = [];
    const deltaSessions = [];

    for (const sess of sessions.values()) {
      if (sess.isBaseline) {
        baselineSessions.push(sess);
      } else {
        deltaSessions.push(sess);
      }
    }

    // Drift is only meaningful with data on both sides of the split.
    if (!baselineSessions.length || !deltaSessions.length) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    // Build baseline host counts.
    const baselineCounts = new Map();
    for (const sess of baselineSessions) {
      for (const h of sess.hosts) {
        baselineCounts.set(h, (baselineCounts.get(h) ?? 0) + 1);
      }
    }

    const baselineDist = normalizeCounts(baselineCounts);
    if (!baselineDist.size) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    const baselineMetrics = this._buildSessionMetricsForGroup(
      baselineSessions,
      baselineDist
    );
    const deltaMetrics = this._buildSessionMetricsForGroup(
      deltaSessions,
      baselineDist
    );

    return { baselineMetrics, deltaMetrics };
  }
}