tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

MemoriesDriftDetector.sys.mjs (13079B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 */
      5 
      6 import { PlacesUtils } from "resource://gre/modules/PlacesUtils.sys.mjs";
      7 import { MemoriesManager } from "moz-src:///browser/components/aiwindow/models/memories/MemoriesManager.sys.mjs";
      8 import { sessionizeVisits } from "moz-src:///browser/components/aiwindow/models/memories/MemoriesHistorySource.sys.mjs";
      9 
     10 import {
     11  // How many of the most recent delta sessions to evaluate against thresholds.
     12  DRIFT_EVAL_DELTA_COUNT as DEFAULT_EVAL_DELTA_COUNT,
     13  // Quantile of baseline scores used as a threshold (e.g. 0.9 => 90th percentile).
     14  DRIFT_TRIGGER_QUANTILE as DEFAULT_TRIGGER_QUANTILE,
     15 } from "moz-src:///browser/components/aiwindow/models/memories/MemoriesConstants.sys.mjs";
     16 
     17 /**
     18 * @typedef {object} SessionMetric
     19 * @property {string|number} sessionId  Unique identifier for the session
     20 * @property {number} jsScore          Jensen–Shannon divergence for the session
     21 * @property {number} avgSurprisal     Average surprisal for the session
     22 * @property {number} [timestampMs]    Optional timestamp for debugging
     23 */
     24 
     25 /**
     26 * This class detects drift to help decide when to run memories generation.
     27 *
     28 * High-level flow for history-based drift:
     29 *  1. Read last_history_memory_ts via MemoriesManager.getLastHistoryMemoryTimestamp().
     30 *  2. Use a DRIFT_LOOKBACK_DAYS (e.g. 14 days) lookback prior to that timestamp
     31 *     to define a baseline window, and include all visits from that lookback to "now".
     32 *  3. Sessionize visits via sessionizeVisits().
     33 *  4. Split sessions into:
     34 *        baseline: session_start_ms < last_history_memory_ts
     35 *        delta:    session_start_ms >= last_history_memory_ts
     36 *  5. Build a baseline host distribution from baseline sessions.
     37 *  6. For BOTH baseline and delta sessions, compute:
     38 *        - JS divergence vs baseline.
     39 *        - Average surprisal vs baseline.
     40 *  7. Use baseline metrics to derive thresholds (e.g. 0.9 quantile),
     41 *     and compare recent delta sessions to those thresholds to decide a trigger.
     42 */
     43 
// Lookback period (in days) before lastHistoryMemoryTS used to define the
// baseline window.
const DRIFT_LOOKBACK_DAYS = 14;
// Cap on how many visits to fetch from Places.
const DRIFT_HISTORY_LIMIT = 5000;

// Milliseconds in one day; converts DRIFT_LOOKBACK_DAYS to milliseconds.
const MS_PER_DAY = 24 * 60 * 60 * 1000;
// Places stores visit_date in microseconds; multiplier for ms -> micros.
const MICROS_PER_MS = 1000;
// Tiny probability substituted for unseen events so log()/log2() stay finite.
const EPS = 1e-12;
     52 
// Fetch visits newer than :cutoff (visit_date is in microseconds), most
// recent first, capped at :limit rows. Rows lacking a title, a frecency
// value, or a non-empty host are filtered out so every returned visit has a
// usable host for sessionization.
const DRIFT_HISTORY_SQL = `
 SELECT
   p.id         AS place_id,
   p.url        AS url,
   o.host       AS host,
   p.title      AS title,
   v.visit_date AS visit_date
 FROM moz_places p
 JOIN moz_historyvisits v ON v.place_id = p.id
 JOIN moz_origins o       ON p.origin_id = o.id
 WHERE v.visit_date >= :cutoff
   AND p.title IS NOT NULL
   AND p.frecency IS NOT NULL
   AND o.host IS NOT NULL
   AND length(o.host) > 0
 ORDER BY v.visit_date DESC
 LIMIT :limit
`;
     71 
     72 /**
     73 * Compute the q-quantile of an array of numbers.
     74 *
     75 * @param {number[]} values
     76 * @param {number} quantile in [0, 1], e.g. 0.9
     77 * @returns {number}
     78 */
     79 function computeQuantile(values, quantile) {
     80  if (!values.length) {
     81    return 0;
     82  }
     83  const sorted = [...values].sort((a, b) => a - b);
     84  const pos = (sorted.length - 1) * quantile;
     85  const lowerIdx = Math.floor(pos);
     86  const upperIdx = Math.ceil(pos);
     87 
     88  if (lowerIdx === upperIdx) {
     89    return sorted[lowerIdx];
     90  }
     91  const lower = sorted[lowerIdx];
     92  const upper = sorted[upperIdx];
     93  const weight = pos - lowerIdx;
     94  return lower + weight * (upper - lower);
     95 }
     96 
     97 /**
     98 * Compute KL divergence KL(P || Q).
     99 *
    100 * @param {Map<string, number>} p
    101 * @param {Map<string, number>} q
    102 * @returns {number}
    103 */
    104 function klDiv(p, q) {
    105  let sum = 0;
    106  for (const [key, pVal] of p.entries()) {
    107    if (pVal <= 0) {
    108      continue;
    109    }
    110    const qVal = q.get(key) ?? EPS;
    111    const ratio = pVal / qVal;
    112    sum += pVal * Math.log(ratio);
    113  }
    114  return sum;
    115 }
    116 
    117 /**
    118 * Build a normalized probability distribution (Map) from host to count.
    119 *
    120 * @param {Map<string, number>} counts
    121 * @returns {Map<string, number>}
    122 */
    123 function normalizeCounts(counts) {
    124  if (!counts.size) {
    125    return new Map();
    126  }
    127  let total = 0;
    128  for (const v of counts.values()) {
    129    total += v;
    130  }
    131  const dist = new Map();
    132  for (const [k, v] of counts.entries()) {
    133    dist.set(k, v / Math.max(1, total));
    134  }
    135  return dist;
    136 }
    137 
    138 /**
    139 * Compute Jensen–Shannon divergence between two distributions P and Q.
    140 *
    141 * P and Q are Maps of host to probability.
    142 *
    143 * @param {Map<string, number>} p
    144 * @param {Map<string, number>} q
    145 * @returns {number}
    146 */
    147 function jsDivergence(p, q) {
    148  if (!p.size || !q.size) {
    149    return 0;
    150  }
    151  const m = new Map();
    152  const allKeys = new Set([...p.keys(), ...q.keys()]);
    153  for (const key of allKeys) {
    154    const pv = p.get(key) ?? 0;
    155    const qv = q.get(key) ?? 0;
    156    m.set(key, 0.5 * (pv + qv));
    157  }
    158  const klPM = klDiv(p, m);
    159  const klQM = klDiv(q, m);
    160  return 0.5 * (klPM + klQM);
    161 }
    162 
    163 /**
    164 * Compute average surprisal of a session under a baseline distribution.
    165 *
    166 * For each visit host in the session, surprisal = -log2 P_baseline(host).
    167 * If a host is unseen, a small epsilon is used.
    168 *
    169 * @param {string[]} hosts
    170 * @param {Map<string, number>} baselineDist
    171 * @returns {number}
    172 */
    173 function averageSurprisal(hosts, baselineDist) {
    174  if (!hosts.length || !baselineDist.size) {
    175    return 0;
    176  }
    177  let sum = 0;
    178  for (const host of hosts) {
    179    const prob = baselineDist.get(host) ?? EPS;
    180    sum += -Math.log2(prob);
    181  }
    182  return sum / hosts.length;
    183 }
    184 
/**
 * Detects drift in recent browsing history relative to the baseline that
 * produced the last history memory. All members are static; the class is a
 * stateless namespace (see the module-level flow comment above).
 */
export class MemoriesDriftDetector {
  /**
   * Convenience helper: compute metrics AND a trigger decision in one call.
   *
   * @param {object} [options]
   * @param {number} [options.triggerQuantile]
   *   Quantile of baseline scores used as the trigger threshold; defaults to
   *   DRIFT_TRIGGER_QUANTILE from MemoriesConstants.
   * @param {number} [options.evalDeltaCount]
   *   How many of the most recent delta sessions to evaluate; defaults to
   *   DRIFT_EVAL_DELTA_COUNT from MemoriesConstants.
   * @returns {Promise<{
   *   baselineMetrics: SessionMetric[],
   *   deltaMetrics: SessionMetric[],
   *   trigger: {
   *     jsThreshold: number,
   *     surpriseThreshold: number,
   *     triggered: boolean,
   *     triggeredSessionIds: Array<string|number>,
   *   },
   * }>}
   */
  static async computeHistoryDriftAndTrigger(options = {}) {
    const { baselineMetrics, deltaMetrics } =
      await this.computeHistoryDriftSessionMetrics();

    const trigger = this.computeDriftTriggerFromBaseline(
      baselineMetrics,
      deltaMetrics,
      options
    );

    return { baselineMetrics, deltaMetrics, trigger };
  }

  /**
   * Build SessionMetric[] for a group of sessions, given a baseline distribution.
   *
   * Each session is scored by (a) JS divergence between its own host
   * distribution and the baseline, and (b) average surprisal of its hosts
   * under the baseline. Results are sorted by session start time, oldest
   * first.
   *
   * @param {Array<{ sessionId: string|number, hosts: string[], startMs: number }>} sessions
   * @param {Map<string, number>} baselineDist
   * @returns {SessionMetric[]}
   */
  static _buildSessionMetricsForGroup(sessions, baselineDist) {
    const metrics = [];

    for (const sess of sessions) {
      // Per-session host frequencies, normalized into a distribution below.
      const sessionHostCounts = new Map();
      for (const h of sess.hosts) {
        sessionHostCounts.set(h, (sessionHostCounts.get(h) ?? 0) + 1);
      }
      const sessionDist = normalizeCounts(sessionHostCounts);
      const jsScore = jsDivergence(sessionDist, baselineDist);
      const avgSurp = averageSurprisal(sess.hosts, baselineDist);

      metrics.push({
        sessionId: sess.sessionId,
        jsScore,
        avgSurprisal: avgSurp,
        timestampMs: sess.startMs,
      });
    }

    // Oldest first, so slice(-n) in the trigger step picks the most recent.
    metrics.sort((a, b) => (a.timestampMs ?? 0) - (b.timestampMs ?? 0));
    return metrics;
  }

  /**
   * Trigger computation based on a baseline window and recent delta sessions.
   *
   * Thresholds are the triggerQuantile-quantile of the baseline sessions'
   * scores. A delta session trips the trigger if EITHER its JS divergence or
   * its average surprisal strictly exceeds the matching threshold.
   *
   * @param {SessionMetric[]} baselineMetrics
   * @param {SessionMetric[]} deltaMetrics
   * @param {object} [options]
   * @param {number} [options.triggerQuantile]
   *   Defaults to DRIFT_TRIGGER_QUANTILE from MemoriesConstants.
   * @param {number} [options.evalDeltaCount]
   *   Defaults to DRIFT_EVAL_DELTA_COUNT from MemoriesConstants.
   * @returns {{
   *   jsThreshold: number,
   *   surpriseThreshold: number,
   *   triggered: boolean,
   *   triggeredSessionIds: Array<string|number>,
   * }}
   */
  static computeDriftTriggerFromBaseline(
    baselineMetrics,
    deltaMetrics,
    {
      triggerQuantile = DEFAULT_TRIGGER_QUANTILE,
      evalDeltaCount = DEFAULT_EVAL_DELTA_COUNT,
    } = {}
  ) {
    // Without both a non-empty baseline and a non-empty delta there is
    // nothing meaningful to compare; report an untriggered result.
    if (
      !Array.isArray(baselineMetrics) ||
      !baselineMetrics.length ||
      !Array.isArray(deltaMetrics) ||
      !deltaMetrics.length
    ) {
      return {
        jsThreshold: 0,
        surpriseThreshold: 0,
        triggered: false,
        triggeredSessionIds: [],
      };
    }

    const jsBase = baselineMetrics.map(m => m.jsScore ?? 0);
    const surpBase = baselineMetrics.map(m => m.avgSurprisal ?? 0);

    const jsThreshold = computeQuantile(jsBase, triggerQuantile);
    const surpriseThreshold = computeQuantile(surpBase, triggerQuantile);

    // Evaluate only the most recent evalDeltaCount sessions; deltaMetrics is
    // sorted oldest-first by _buildSessionMetricsForGroup, so slice(-n) is
    // the newest tail.
    const evalMetrics =
      deltaMetrics.length > evalDeltaCount
        ? deltaMetrics.slice(-evalDeltaCount)
        : deltaMetrics;

    const triggeredSessionIds = [];
    for (const m of evalMetrics) {
      const jsTriggered = (m.jsScore ?? 0) > jsThreshold;
      const surpTriggered = (m.avgSurprisal ?? 0) > surpriseThreshold;
      if (jsTriggered || surpTriggered) {
        triggeredSessionIds.push(m.sessionId);
      }
    }

    return {
      jsThreshold,
      surpriseThreshold,
      triggered: !!triggeredSessionIds.length,
      triggeredSessionIds,
    };
  }

  /**
   * Compute per-session drift metrics (JS divergence and average surprisal)
   * for baseline and delta sessions, based on history around the last
   * history memory timestamp.
   *
   * Baseline window:
   *   [last_history_memory_ts - DRIFT_LOOKBACK_DAYS, last_history_memory_ts)
   * Delta window:
   *   [last_history_memory_ts, now)
   *
   * If there is no prior history memory timestamp, or if there is not enough
   * data to form both baseline and delta, this returns empty arrays.
   *
   * @returns {Promise<{ baselineMetrics: SessionMetric[], deltaMetrics: SessionMetric[] }>}
   */
  static async computeHistoryDriftSessionMetrics() {
    const lastTsMs = await MemoriesManager.getLastHistoryMemoryTimestamp();
    if (!lastTsMs) {
      // No prior memories -> no meaningful baseline yet.
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    const lookbackStartMs = lastTsMs - DRIFT_LOOKBACK_DAYS * MS_PER_DAY;
    // Places stores visit_date in microseconds; clamp to >= 0 before scaling.
    const cutoffMicros = Math.max(0, lookbackStartMs) * MICROS_PER_MS;

    // NOTE: the SQL aliases are snake_case but the in-memory rows use
    // camelCase keys (placeId, visitDateMicros); sessionizeVisits receives
    // rows in this camelCase shape.
    /** @type {Array<{ placeId:number, url:string, host:string, title:string, visitDateMicros:number }>} */
    const rows = [];
    await PlacesUtils.withConnectionWrapper(
      "MemoriesDriftDetector:computeHistoryDriftSessionMetrics",
      async db => {
        const stmt = await db.executeCached(DRIFT_HISTORY_SQL, {
          cutoff: cutoffMicros,
          limit: DRIFT_HISTORY_LIMIT,
        });
        // Copy the fields we need out of each result row.
        for (const row of stmt) {
          rows.push({
            placeId: row.getResultByName("place_id"),
            url: row.getResultByName("url"),
            host: row.getResultByName("host"),
            title: row.getResultByName("title"),
            visitDateMicros: row.getResultByName("visit_date"),
          });
        }
      }
    );

    if (!rows.length) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    // You can tune gapSec if you want shorter / longer sessions using opts = { gapSec: 900 }
    const sessionized = sessionizeVisits(rows);

    // Group sessionized visit rows by session_id. Each sessionized row is
    // expected to carry session_id, session_start_ms and host (see the reads
    // below) — confirm against MemoriesHistorySource if this changes.
    /** @type {Map<number, { sessionId: number, hosts: string[], isBaseline: boolean, startMs: number }>} */
    const sessions = new Map();

    for (const row of sessionized) {
      const sessionId = row.session_id;
      const startMs = row.session_start_ms;
      const host = row.host;

      if (!host) {
        continue;
      }

      let sess = sessions.get(sessionId);
      if (!sess) {
        sess = {
          sessionId,
          hosts: [],
          // Sessions that started before the last memory run form the baseline.
          isBaseline: startMs < lastTsMs,
          startMs,
        };
        sessions.set(sessionId, sess);
      }
      sess.hosts.push(host);
    }

    const baselineSessions = [];
    const deltaSessions = [];

    for (const sess of sessions.values()) {
      if (sess.isBaseline) {
        baselineSessions.push(sess);
      } else {
        deltaSessions.push(sess);
      }
    }

    // Both windows must be non-empty for the comparison to mean anything.
    if (!baselineSessions.length || !deltaSessions.length) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    // Build baseline host counts.
    const baselineCounts = new Map();
    for (const sess of baselineSessions) {
      for (const h of sess.hosts) {
        baselineCounts.set(h, (baselineCounts.get(h) ?? 0) + 1);
      }
    }

    const baselineDist = normalizeCounts(baselineCounts);
    if (!baselineDist.size) {
      return { baselineMetrics: [], deltaMetrics: [] };
    }

    const baselineMetrics = this._buildSessionMetricsForGroup(
      baselineSessions,
      baselineDist
    );
    const deltaMetrics = this._buildSessionMetricsForGroup(
      deltaSessions,
      baselineDist
    );

    return { baselineMetrics, deltaMetrics };
  }
}