tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

RankShortcutsWorkerClass.mjs (18812B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
      4 
      5 import { thompsonSampleSort } from "resource://newtab/lib/SmartShortcutsRanker/ThomSample.mjs"; //"ThomSample.mjs";
      6 
      7 /**
      8 * Linear interpolation of values in histogram, wraps from end to beginning
      9 *
     10 * @param {number[]} hist Defines histogram of counts
     11 * @param {number} t Time/Index we are interpolating to
     12 * @returns {normed: number} Normalized number
     13 */
     14 export function interpolateWrappedHistogram(hist, t) {
     15  if (!hist.length) {
     16    return hist;
     17  }
     18  const n = hist.length;
     19  const lo = Math.floor(t) % n;
     20  const hi = (lo + 1) % n;
     21  const frac = t - Math.floor(t);
     22 
     23  // Adjust for negative t
     24  const loWrapped = (lo + n) % n;
     25  const hiWrapped = (hi + n) % n;
     26 
     27  return (1 - frac) * hist[loWrapped] + frac * hist[hiWrapped];
     28 }
     29 /**
     30 * Bayesian update of vec to reflect pvec weighted by tau
     31 *
     32 * @param {number[]} vec Array of values to update
     33 * @param {number[]} pvec Normalized array of values to reference
     34 * @param {number} tau Strength of the update
     35 * @returns {number[]} Normalized array
     36 */
     37 export function bayesHist(vec, pvec, tau) {
     38  if (!pvec || !vec.length || vec.length !== pvec.length) {
     39    return vec;
     40  }
     41  const vsum = vec.reduce((a, b) => a + b, 0);
     42  if (vsum + tau === 0) {
     43    return vec;
     44  }
     45  const bhist = vec.map((v, i) => (v + tau * pvec[i]) / (vsum + tau));
     46  return bhist;
     47 }
     48 
     49 function getCurrentHourOfDay() {
     50  const now = new Date();
     51  return now.getHours() + now.getMinutes() / 60 + now.getSeconds() / 3600;
     52 }
     53 
     54 function getCurrentDayOfWeek() {
     55  const now = new Date();
     56  return now.getDay(); // returns an int not a float
     57 }
     58 
/**
 * NOTE(review): orphaned JSDoc — no function follows this comment; the
 * helper it documents appears to have been removed. Confirm with the owner
 * whether this block should be deleted or the helper restored.
 *
 * Compute average clicks and imps over items
 * Smooth those averages towards priors
 *
 * @param {number[]} clicks Array of clicks
 * @param {number[]} imps Array of impressions
 * @param {number} pos_rate prior for clicks
 * @param {number} neg_rate prior for impressions
 * @returns {number[]} smoothed click count and impression count
 */
     70 /**
     71 * Normalize values in an array by the sum of the array
     72 *
     73 * @param {number[]} vec Array of values to normalize
     74 * @returns {number[]} Normalized array
     75 */
     76 export function sumNorm(vec) {
     77  if (!vec.length) {
     78    return vec;
     79  }
     80  const vsum = vec.reduce((a, b) => a + b, 0);
     81 
     82  let normed = [];
     83  if (!Number.isFinite(vsum) || vsum !== 0) {
     84    normed = vec.map(v => v / vsum);
     85  } else {
     86    normed = vec;
     87  }
     88  return normed;
     89 }
     90 
     91 /**
     92 * this function normalizes all values in vals and updates
     93 * the running mean and variance in normobj
     94 * normobj stores info to calculate a running mean and variance
     95 * over a feature, this is stored in the shortcut cache
     96 *
     97 * @param {number[]} vals scores to normalize
     98 * @param {object} normobj Dictionary of storing info for running mean var
     99 * @returns {[number, obj]} normalized features and the updated object
    100 */
    101 export function normUpdate(vals, input_normobj) {
    102  if (!vals.length) {
    103    return [vals, input_normobj];
    104  }
    105  let normobj = {};
    106  if (
    107    !input_normobj ||
    108    !Number.isFinite(input_normobj.mean) ||
    109    !Number.isFinite(input_normobj.var)
    110  ) {
    111    normobj = { beta: 1e-3, mean: vals[0], var: 1.0 };
    112  } else {
    113    normobj = { ...input_normobj };
    114  }
    115  let delta = 0;
    116  for (const v of vals) {
    117    delta = v - normobj.mean;
    118    normobj.mean += normobj.beta * delta;
    119    normobj.var =
    120      (1 - normobj.beta) * normobj.var + normobj.beta * delta * delta;
    121  }
    122  if (normobj.var <= 1e-8) {
    123    normobj.var = 1e-8;
    124  }
    125 
    126  const std = Math.sqrt(normobj.var);
    127  const new_vals = vals.map(v => (v - normobj.mean) / std);
    128  return [new_vals, normobj];
    129 }
    130 
    131 /**
    132 * Normalize a dictionary of {key: hist[]} using squared values and column-wise normalization.
    133 * Returns {key: normedHist[]} where each hist[j] is divided by sum_k hist_k[j]^2.
    134 *
    135 * @param {{[key: string]: number}} dict - A dictionary mapping keys to arrays of P(t|s) values.
    136 * @returns {{[key: string]: number}} New dictionary with normalized histograms (P(s|t)).
    137 */
    138 export function normHistDict(dict) {
    139  const keys = Object.keys(dict);
    140  if (keys.length === 0) {
    141    return {};
    142  }
    143 
    144  const t = dict[keys[0]].length;
    145 
    146  // square all hist values to emphasize differences
    147  const squared = {};
    148  for (const [key, hist] of Object.entries(dict)) {
    149    squared[key] = hist.map(v => v * v);
    150  }
    151 
    152  // compute column-wise sums
    153  const colSums = Array(t).fill(0);
    154  for (let j = 0; j < t; j++) {
    155    for (const key of keys) {
    156      colSums[j] += squared[key][j];
    157    }
    158  }
    159 
    160  // normalize
    161  const normalized = {};
    162  for (const [key, row] of Object.entries(squared)) {
    163    normalized[key] = row.map((val, j) => (colSums[j] ? val / colSums[j] : 0));
    164  }
    165 
    166  return normalized;
    167 }
    168 
    169 /**
    170 * Compute linear combination of scores, weighted
    171 *
    172 * @param {object[]} scores Dictionary of scores
    173 * @param {object[]} weights Dictionary of weights
    174 * @returns {number} Linear combination of scores*weights
    175 */
    176 export function computeLinearScore(scores, weights) {
    177  let final = 0;
    178  let score = 0;
    179  for (const [feature, weight] of Object.entries(weights)) {
    180    score = scores[feature] ?? 0;
    181    final += score * weight;
    182  }
    183  return final;
    184 }
    185 
    186 export function processSeasonality(guids, input, tau, curtime) {
    187  const { hists } = input;
    188  const { pvec } = input;
    189  // normalize new/rare site visits using general seasonlity to control against "fliers"
    190  const b_hists = Object.fromEntries(
    191    Object.entries(hists).map(([guid, hist]) => [
    192      guid,
    193      bayesHist(hist, pvec, tau),
    194    ])
    195  );
    196  // convert to P(time | site), prepare for bayes
    197  const timegivensite = Object.fromEntries(
    198    Object.entries(b_hists).map(([guid, hist]) => [guid, sumNorm(hist)])
    199  );
    200  // use bayes to convert to P(site | time)
    201  const sitegiventime = normHistDict(timegivensite);
    202  // interpolate P(site | time) to weight each site
    203  const weights = Object.fromEntries(
    204    Object.entries(sitegiventime).map(([guid, hist]) => [
    205      guid,
    206      interpolateWrappedHistogram(hist, curtime),
    207    ])
    208  );
    209  // make it an array
    210  const weightsarr = guids.map(key => weights[key]);
    211  // normalize to account for interpolation oddities
    212  const normweights = sumNorm(weightsarr);
    213  return normweights;
    214 }
    215 
// Visit type codes in Places (one numeric code per recorded visit).
const TYPE = {
  LINK: 1,
  TYPED: 2,
  BOOKMARK: 3,
  EMBED: 4,
  REDIRECT_PERM: 5,
  REDIRECT_TEMP: 6,
  DOWNLOAD: 7,
  FRAMED_LINK: 8,
  RELOAD: 9,
};

// Default bonus per visit type; values copied from frecency. Only typed
// URLs, followed links, and bookmark visits contribute — redirects, embeds,
// framed links, downloads, and reloads all score 0.
const TYPE_SCORE = {
  [TYPE.TYPED]: 200,
  [TYPE.LINK]: 100,
  [TYPE.BOOKMARK]: 75,
  [TYPE.RELOAD]: 0,
  [TYPE.REDIRECT_PERM]: 0,
  [TYPE.REDIRECT_TEMP]: 0,
  [TYPE.EMBED]: 0,
  [TYPE.FRAMED_LINK]: 0,
  [TYPE.DOWNLOAD]: 0,
};
    241 /**
    242 * Build features that break apart frecency into:
    243 *        frequency, recency, re-frecency
    244 * frequency total_visits*F(visit_types)
    245 * recency is exponential decay of last 10 visits
    246 * re-frecency is dot_product(frequency,recency)
    247 *
    248 * difference between frecency and re-frecency is how the
    249 * recency calc is done, recency is exponential decay instead
    250 * of the frecency buckets for interpretability
    251 *
    252 * @param {object} visitCounts guid -> {visit.type, visit.time}
    253 * @param {object} visitCounts guid -> total visit count
    254 * @param {object} opts options for controlling calc
    255 * @returns {Promise<{pvec: number[]|null, hists: any}>}
    256 */
    257 export async function buildFrecencyFeatures(
    258  visitsByGuid,
    259  visitCounts,
    260  opts = {}
    261 ) {
    262  const {
    263    halfLifeDays = 28, // 28 reproduces frecency
    264  } = opts;
    265 
    266  const nowMs = Date.now();
    267  const dayMs = 864e5;
    268  const tauDays = halfLifeDays / Math.log(2);
    269 
    270  // Transposed output: { refre: {guid:...}, rece: {guid:...}, freq: {guid:...} }
    271  const byFeature = { refre: {}, rece: {}, freq: {}, unid: {} };
    272 
    273  for (const [guid, visits] of Object.entries(visitsByGuid)) {
    274    // take the log here, original frecency grows linearly with visits
    275    // lets test out log growth
    276    const total = Math.log((visitCounts?.[guid] ?? 0) + 1);
    277 
    278    const time_scores = [];
    279    const type_scores = [];
    280 
    281    const days_visited = new Set([]);
    282 
    283    for (let i = 0; i < visits.length; i++) {
    284      const { visit_date_us, visit_type } = visits[i];
    285      const ageDays = (nowMs - visit_date_us / 1000) / dayMs;
    286      days_visited.add(Math.floor(ageDays));
    287      const t = Math.exp(-ageDays / tauDays); // exponential decay
    288      const b = TYPE_SCORE[visit_type] ?? 0;
    289      time_scores.push(t);
    290      type_scores.push(b);
    291    }
    292    // dot captures the interaction between time and type, basically frecency
    293    const dot = time_scores.reduce(
    294      (s, x, i) => s + x * (type_scores[i] ?? 0),
    295      0
    296    );
    297    // time_score is a pure recency feature
    298    const time_score = time_scores.reduce((s, x) => s + x, 0);
    299    // type_score is frequency feature weighted by how the user got to the site
    300    const type_score = type_scores.reduce((s, x) => s + x, 0);
    301 
    302    byFeature.refre[guid] = total * dot; // interaction (≈ frecency-like)
    303    byFeature.rece[guid] = time_score; // recency-only
    304    byFeature.freq[guid] = total * type_score; // frequency-only (lifetime-weighted)
    305    byFeature.unid[guid] = days_visited.size;
    306  }
    307 
    308  return byFeature;
    309 }
    310 
    311 // small helpers used only here
    312 const _projectByGuid = (guids, dict) => guids.map(g => dict[g]);
    313 
    314 const _applyVectorFeature = (
    315  namei,
    316  rawVec,
    317  norms,
    318  score_map,
    319  guids,
    320  updated_norms
    321 ) => {
    322  const [vals, n] = normUpdate(rawVec, norms[namei]);
    323  updated_norms[namei] = n;
    324  guids.forEach((g, i) => {
    325    score_map[g][namei] = vals[i];
    326  });
    327 };
    328 
/**
 * Score every shortcut guid on each enabled feature, then combine the
 * per-feature scores into a single linear "final" score per guid.
 *
 * @param {object} input Worker payload; fields read here include:
 *   guid (string[]), features (string[]), the per-guid score dictionaries
 *   (bmark_scores, open_scores, rece_scores, freq_scores, refre_scores,
 *   unid_scores), clicks/impressions arrays aligned with guid, alpha/beta
 *   (Thompson priors), norms (running mean/var per feature), frecency
 *   (array aligned with guid), hourly_seasonality/daily_seasonality, tau,
 *   and weights (feature -> weight for the final linear score).
 * @returns {Promise<{score_map: object, norms: object}>} Per-guid feature
 *   scores (plus "final") and the updated running-normalization stats.
 */
export async function weightedSampleTopSites(input) {
  // Running-normalization stats refreshed during this call, keyed by feature.
  const updated_norms = {};
  // score_map[guid][feature] starts at 0 for every enabled feature.
  const score_map = Object.fromEntries(
    input.guid.map(guid => [
      guid,
      Object.fromEntries(input.features.map(f => [f, 0])),
    ])
  );

  // Table-driven vector features that already exist as per-guid dictionaries
  const dictFeatures = {
    bmark: () => _projectByGuid(input.guid, input.bmark_scores),
    open: () => _projectByGuid(input.guid, input.open_scores),
    rece: () => _projectByGuid(input.guid, input.rece_scores),
    freq: () => _projectByGuid(input.guid, input.freq_scores),
    refre: () => _projectByGuid(input.guid, input.refre_scores),
    unid: () => _projectByGuid(input.guid, input.unid_scores),
  };

  // 1) Simple vector features
  for (const namei of Object.keys(dictFeatures)) {
    if (input.features.includes(namei)) {
      _applyVectorFeature(
        namei,
        dictFeatures[namei](),
        input.norms,
        score_map,
        input.guid,
        updated_norms
      );
    }
  }

  // 2) CTR feature (derived vector); +1 smoothing avoids division by zero.
  if (input.features.includes("ctr")) {
    const raw_ctr = input.impressions.map(
      (imp, i) => (input.clicks[i] + 1) / (imp + 1)
    );
    _applyVectorFeature(
      "ctr",
      raw_ctr,
      input.norms,
      score_map,
      input.guid,
      updated_norms
    );
  }

  // 3) Thompson feature (special case): sample thetas from the
  // click/no-click Beta posteriors (do_sort: false keeps guid order),
  // then normalize them like any other vector feature.
  if (input.features.includes("thom")) {
    const ranked_thetas = await thompsonSampleSort({
      key_array: input.guid,
      obs_positive: input.clicks,
      obs_negative: input.impressions.map((imp, i) =>
        Math.max(0, imp - input.clicks[i])
      ),
      prior_positive: input.clicks.map(() => input.alpha),
      prior_negative: input.impressions.map(() => input.beta),
      do_sort: false,
    });
    const [vals, n] = normUpdate(ranked_thetas[1], input.norms.thom);
    updated_norms.thom = n;
    input.guid.forEach((g, i) => {
      score_map[g].thom = vals[i];
    });
  }

  // 4) Frecency vector (already an array aligned with input.guid)
  if (input.features.includes("frec")) {
    const [vals, n] = normUpdate(input.frecency, input.norms.frec);
    updated_norms.frec = n;
    input.guid.forEach((g, i) => {
      score_map[g].frec = vals[i];
    });
  }

  // 5) Seasonality features: interpolate each guid's smoothed histogram at
  // the current hour / day of week.
  if (input.features.includes("hour")) {
    const raw = processSeasonality(
      input.guid,
      input.hourly_seasonality,
      input.tau,
      getCurrentHourOfDay()
    );
    _applyVectorFeature(
      "hour",
      raw,
      input.norms,
      score_map,
      input.guid,
      updated_norms
    );
  }
  if (input.features.includes("daily")) {
    const raw = processSeasonality(
      input.guid,
      input.daily_seasonality,
      input.tau,
      getCurrentDayOfWeek()
    );
    _applyVectorFeature(
      "daily",
      raw,
      input.norms,
      score_map,
      input.guid,
      updated_norms
    );
  }

  // 6) Bias: constant 1 so the model can learn an intercept.
  if (input.features.includes("bias")) {
    input.guid.forEach(g => {
      score_map[g].bias = 1;
    });
  }

  // 7) Final score: weighted linear combination of the feature scores.
  for (const g of input.guid) {
    score_map[g].final = computeLinearScore(score_map[g], input.weights);
  }

  return { score_map, norms: updated_norms };
}
    453 
    454 export function clampWeights(weights, maxNorm = 100) {
    455  const norm = Math.hypot(...Object.values(weights));
    456  if (norm > maxNorm) {
    457    const scale = maxNorm / norm;
    458    return Object.fromEntries(
    459      Object.entries(weights).map(([k, v]) => [k, v * scale])
    460    );
    461  }
    462  return weights;
    463 }
    464 
    465 /**
    466 * Update the logistic regression weights for shortcuts ranking using gradient descent.
    467 *
    468 * @param {object} input
    469 * @param {string[]} input.features
    470 *   The list of feature names (keys in weights and score_map[guid]).
    471 * @param {object} input.data
    472 *   Mapping guid -> { clicks: number, impressions: number }.
    473 * @param {object} input.scores
    474 *   Mapping guid -> { final: number, [feature]: number } (final = w·x).
    475 * @param {object} input.weights
    476 *   Current weight vector, keyed by feature
    477 * @param {number} input.eta
    478 *   Learning rate.
    479 * @param {number} [input.click_bonus=1]
    480 *   Multiplicative weight for click events.
    481 * @param {boolean} [do_clamp=true]
    482 *   If true, clamp weights after update.
    483 * @returns {object}
    484 *   Updated weights object.
    485 */
    486 export function updateWeights(input, do_clamp = true) {
    487  const { features } = input;
    488  const { data } = input;
    489  const score_map = input.scores;
    490  let { weights } = input;
    491  const { eta } = input;
    492  const click_bonus = input.click_bonus ?? 1;
    493 
    494  const grads = Object.create(null);
    495  let total = 0;
    496 
    497  // init gradient accumulator
    498  for (let j = 0; j < features.length; j += 1) {
    499    grads[features[j]] = 0;
    500  }
    501 
    502  for (const guid of Object.keys(data)) {
    503    if (
    504      score_map &&
    505      score_map[guid] &&
    506      typeof score_map[guid].final === "number"
    507    ) {
    508      /* impressions without a click can happen for many reasons
    509      that are unrelated to the item. clicks are almost
    510      always a deliberate action. therefore, we should
    511      learn more from clicks. the click_bonus can be viewed
    512      as a weight on click events and biases the model
    513       to learning more from clicks relative to impressions
    514      */
    515      const clicks = (data[guid].clicks | 0) * click_bonus;
    516      const impressions = data[guid].impressions | 0;
    517      if (clicks === 0 && impressions === 0) {
    518        continue;
    519      }
    520 
    521      const z = score_map[guid].final;
    522      const p = 1 / (1 + Math.exp(-z)); // sigmoid
    523      const factor = clicks * (p - 1) + impressions * p;
    524 
    525      for (const feature of features) {
    526        const fval = score_map[guid][feature] || 0;
    527        grads[feature] += factor * fval;
    528      }
    529 
    530      total += clicks + impressions;
    531    }
    532  }
    533 
    534  if (total > 0) {
    535    const scale = eta / total;
    536    for (const feature of features) {
    537      weights[feature] -= scale * grads[feature];
    538    }
    539  }
    540  if (do_clamp) {
    541    weights = clampWeights(weights);
    542  }
    543  return weights;
    544 }
    545 
    546 /**
    547 * Reorder GUIDs into their desired positions.
    548 * - First claimant for a slot gets it.
    549 * - Collisions are resolved by filling remaining slots left→right in original order.
    550 * - If a requested position >= guids.length, that item goes to the last slot.
    551 *
    552 * @param {string[]} guids
    553 * @param {Map<string, number>} posMap  Map of guid → desired 0-based index
    554 * @returns {string[]} reordered guids
    555 */
    556 export function placeGuidsByPositions(guids, posMap) {
    557  const size = guids.length;
    558  const out = Array(size).fill(null);
    559  const placed = new Set();
    560 
    561  // Pass 1: try to place at desired index (clamp > size-1 to size-1)
    562  for (let i = 0; i < guids.length; i++) {
    563    const g = guids[i];
    564    let idx = posMap.get(g);
    565    if (!Number.isInteger(idx) || idx < 0) {
    566      continue;
    567    }
    568 
    569    if (idx >= size) {
    570      idx = size - 1; // clamp oversize → last slot
    571    }
    572 
    573    if (out[idx] === null) {
    574      out[idx] = g;
    575      placed.add(i);
    576    }
    577    // collisions handled in pass 2
    578  }
    579 
    580  // Pass 2: fill holes with unplaced guids, in input order
    581  let cursor = 0;
    582  const putNext = item => {
    583    while (cursor < size && out[cursor] !== null) {
    584      cursor++;
    585    }
    586    if (cursor < size) {
    587      out[cursor++] = item;
    588    }
    589  };
    590 
    591  for (let i = 0; i < guids.length; i++) {
    592    if (!placed.has(i)) {
    593      putNext(guids[i]);
    594    }
    595  }
    596 
    597  return out;
    598 }
    599 
    600 /**
    601 * Given last-click positions aligned to guids, shift by numSponsored (clamp negatives to 0,
    602 * preserve nulls), then place GUIDs accordingly.
    603 *
    604 * Positions and guids are each arrays in the same order, here we map
    605 * guid to positions, we do this here to put as much as possible on the promise
    606 * we shift from "positions" which is absolute shortcut position to array index
    607 * which ignores the sponsored shortcuts.
    608 *
    609 * This function has a known shortcoming where the positions will be incorrect if the number
    610 * of sponsored shortcuts changes. We accept this because 1) changes should be very rare
    611 * 2) it fails safely 3) the differences should be off by the change in the number of sponsored
    612 * shortcuts which is at most 3
    613 *
    614 * @param {(number|null|undefined)[]} positions  // aligned with guids
    615 * @param {string[]} guids
    616 * @param {number} numSponsored
    617 * @returns {string[]} reordered guids
    618 */
    619 export function applyStickyClicks(positions, guids, numSponsored) {
    620  const guidToPos = new Map(
    621    guids.map((g, i) => {
    622      const pos = positions[i];
    623      if (pos === null) {
    624        return [g, null]; // preserve nulls
    625      }
    626      const shifted = pos - numSponsored;
    627      return [g, shifted < 0 ? 0 : shifted]; // clamp negatives
    628    })
    629  );
    630 
    631  // Use either variant depending on how you want collisions handled:
    632  return placeGuidsByPositions(guids, guidToPos, { oneBased: false });
    633 }
    634 
/**
 * Worker-facing API surface: thin async wrappers around the module-level
 * ranking helpers, so each can be invoked as a method on the worker object.
 * updateWeights is invoked without the optional do_clamp flag (so clamping
 * defaults to on), and buildFrecencyFeatures without opts (defaults apply).
 */
export class RankShortcutsWorker {
  async weightedSampleTopSites(input) {
    return weightedSampleTopSites(input);
  }
  async sumNorm(vec) {
    return sumNorm(vec);
  }
  async updateWeights(input) {
    return updateWeights(input);
  }
  async buildFrecencyFeatures(raw_frec, visit_totals) {
    return buildFrecencyFeatures(raw_frec, visit_totals);
  }
  async applyStickyClicks(positions, topsites, numSponsored) {
    return applyStickyClicks(positions, topsites, numSponsored);
  }
}