tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SearchBrowsingHistoryDomainBoost.sys.mjs (8859B)


      1 /**
      2 * This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
      5 */
      6 
      7 /**
      8 * SearchBrowsingHistoryDomainBoost
      9 *
     10 * Temporary heuristic for general-category queries (games, movies, news, etc.)
     11 * when semantic embeddings over title/description are insufficient.
     12 *
     13 * Safe to remove once richer embeddings or better intent classification lands.
     14 */
     15 
/**
 * Static category → domain mapping used by {@link matchDomains}.
 *
 * Each category pairs a set of trigger phrases (`terms`, matched as whole
 * tokens against the normalized query) with a list of root domains whose
 * history entries should be boosted for that category. `language` marks the
 * locale the `terms` are written for; only English is covered here.
 *
 * NOTE: category order matters — matchDomains returns the first category
 * whose term matches, so more specific categories (e.g. "tech_news",
 * "finance_news") are listed before the generic "news" bucket.
 */
export const CATEGORIES_JSON = {
  language: "en",
  categories: [
    {
      id: "games",
      terms: [
        "game",
        "games",
        "video game",
        "video games",
        "pc games",
        "console games",
      ],
      domains: [
        "store.steampowered.com",
        "roblox.com",
        "ign.com",
        "gamespot.com",
        "polygon.com",
        "metacritic.com",
        "epicgames.com",
        "store.playstation.com",
        "xbox.com",
        "nintendo.com",
      ],
    },
    {
      id: "movies",
      terms: ["movie", "movies", "film", "films", "cinema"],
      domains: [
        "imdb.com",
        "rottentomatoes.com",
        "metacritic.com",
        "letterboxd.com",
        "netflix.com",
        "primevideo.com",
        "disneyplus.com",
        "hulu.com",
        "max.com",
      ],
    },
    {
      id: "tv",
      terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"],
      domains: [
        "imdb.com",
        "rottentomatoes.com",
        "metacritic.com",
        "tvmaze.com",
        "thetvdb.com",
        "netflix.com",
        "primevideo.com",
        "disneyplus.com",
        "hulu.com",
        "max.com",
      ],
    },
    {
      id: "books",
      terms: ["book", "books", "novel", "novels"],
      domains: [
        "goodreads.com",
        "gutenberg.org",
        "openlibrary.org",
        "barnesandnoble.com",
        "indigo.ca",
      ],
    },
    {
      id: "anime",
      terms: ["anime", "manga"],
      domains: [
        "myanimelist.net",
        "anilist.co",
        "kitsu.app",
        "crunchyroll.com",
      ],
    },
    {
      id: "music",
      terms: ["music", "song", "songs", "album", "albums", "lyrics"],
      domains: [
        "spotify.com",
        "music.apple.com",
        "soundcloud.com",
        "bandcamp.com",
        "music.youtube.com",
      ],
    },
    {
      id: "podcasts",
      terms: ["podcast", "podcasts"],
      domains: [
        "podcasts.apple.com",
        "overcast.fm",
        "pocketcasts.com",
        "castbox.fm",
      ],
    },
    {
      id: "papers_research",
      terms: [
        "paper",
        "papers",
        "research paper",
        "research papers",
        "academic paper",
        "academic papers",
        "journal",
        "journals",
        "study",
        "studies",
        "publication",
        "publications",
      ],
      domains: [
        "scholar.google.com",
        "arxiv.org",
        "semanticscholar.org",
        "pubmed.ncbi.nlm.nih.gov",
        "researchgate.net",
        "ieeexplore.ieee.org",
        "dl.acm.org",
        "springer.com",
        "nature.com",
        "science.org",
      ],
    },
    // More specific news categories are listed before the generic "news"
    // bucket so e.g. "tech news" boosts tech outlets, not general press.
    {
      id: "tech_news",
      terms: ["tech news", "technology news", "startup news"],
      domains: [
        "theverge.com",
        "techcrunch.com",
        "wired.com",
        "arstechnica.com",
        "engadget.com",
      ],
    },
    {
      id: "finance_news",
      terms: ["finance news", "business news", "market news", "stock news"],
      domains: [
        "bloomberg.com",
        "wsj.com",
        "ft.com",
        "reuters.com",
        "cnbc.com",
      ],
    },
    {
      id: "news",
      terms: [
        "news",
        "headline",
        "headlines",
        "breaking news",
        "world news",
        "latest news",
      ],
      domains: [
        "reuters.com",
        "apnews.com",
        "bbc.com",
        "cnn.com",
        "nytimes.com",
        "theguardian.com",
        "washingtonpost.com",
        "aljazeera.com",
        "npr.org",
        "wsj.com",
        "bloomberg.com",
        "ft.com",
      ],
    },
    {
      id: "recipes",
      terms: [
        "recipe",
        "recipes",
        "cooking",
        "food",
        "dinner ideas",
        "meal prep",
      ],
      domains: [
        "allrecipes.com",
        "seriouseats.com",
        "foodnetwork.com",
        "bbcgoodfood.com",
        "epicurious.com",
        "nytcooking.com",
      ],
    },
    {
      id: "travel",
      terms: ["travel", "hotels", "places", "destinations", "things to do"],
      domains: [
        "tripadvisor.com",
        "booking.com",
        "expedia.com",
        "airbnb.com",
        "lonelyplanet.com",
      ],
    },
  ],
};
    223 
    224 /**
    225 * Normalizes a query string into a lowercase, space-separated form suitable for matching
    226 * and comparison.
    227 *
    228 * @param {string} s
    229 * @returns {string}
    230 */
    231 function normalizeQuery(s) {
    232  return (s || "")
    233    .toLowerCase()
    234    .replace(/[^\p{L}\p{N}]+/gu, " ")
    235    .replace(/\s+/g, " ")
    236    .trim();
    237 }
    238 
    239 /**
    240 * Returns the matched category domains if searchTerm looks like a general category query.
    241 * Uses phrase matching on normalized query string.
    242 *
    243 * @param {string} searchTerm
    244 * @param {object} [categoriesJson=CATEGORIES_JSON]
    245 * @returns {string[]|null}
    246 */
    247 export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) {
    248  const q = ` ${normalizeQuery(searchTerm)} `;
    249  if (!q.trim()) {
    250    return null;
    251  }
    252 
    253  for (const cat of categoriesJson.categories) {
    254    for (const t of cat.terms) {
    255      // Pad with spaces to enable whole-token phrase matching via includes.
    256      const tt = ` ${normalizeQuery(t)} `;
    257      if (tt.trim() && q.includes(tt)) {
    258        return cat.domains;
    259      }
    260    }
    261  }
    262 
    263  return null;
    264 }
    265 
    266 /**
    267 * Builds a SQL WHERE clause for matching `http`/`https` URLs belonging
    268 * to the given root domains and their `www` variants.
    269 *
    270 * @param {string[]} domains
    271 * @returns {{ where: string, params: object }}
    272 */
    273 function buildDomainUrlWhere(domains) {
    274  const clauses = [];
    275  const params = {};
    276  let i = 0;
    277 
    278  for (const raw of domains || []) {
    279    const d = String(raw).toLowerCase();
    280    if (!d) {
    281      continue;
    282    }
    283 
    284    // - https://domain/...
    285    // - https://www.domain/...
    286    params[`d${i}`] = `%://${d}/%`;
    287    clauses.push(`lower(url) LIKE :d${i++}`);
    288 
    289    params[`d${i}`] = `%://www.${d}/%`;
    290    clauses.push(`lower(url) LIKE :d${i++}`);
    291  }
    292 
    293  return {
    294    where: clauses.length ? `(${clauses.join(" OR ")})` : "0",
    295    params,
    296  };
    297 }
    298 
    299 /**
    300 * Domain-filtered moz_places query (time-windowed).
    301 *
    302 * @param {object} params
    303 * @param {object} params.conn
    304 * @param {string[]} params.domains
    305 * @param {number|null} params.startTs
    306 * @param {number|null} params.endTs
    307 * @param {number} params.historyLimit
    308 * @param {Function} params.buildHistoryRow
    309 * @returns {Promise<object[]>}
    310 */
    311 export async function searchByDomains({
    312  conn,
    313  domains,
    314  startTs,
    315  endTs,
    316  historyLimit,
    317  buildHistoryRow,
    318 }) {
    319  if (!conn || !Array.isArray(domains) || !domains.length) {
    320    return [];
    321  }
    322 
    323  const { where, params } = buildDomainUrlWhere(domains);
    324 
    325  const results = await conn.executeCached(
    326    `
    327      SELECT id,
    328             title,
    329             url,
    330             NULL AS distance,
    331             visit_count,
    332             frecency,
    333             last_visit_date,
    334             preview_image_url
    335      FROM moz_places
    336      WHERE frecency <> 0
    337        AND (:startTs IS NULL OR last_visit_date >= :startTs)
    338        AND (:endTs IS NULL OR last_visit_date <= :endTs)
    339        AND ${where}
    340      ORDER BY last_visit_date DESC, frecency DESC
    341      LIMIT :limit
    342    `,
    343    {
    344      startTs,
    345      endTs,
    346      limit: historyLimit,
    347      ...params,
    348    }
    349  );
    350 
    351  const rows = [];
    352  for (const row of results) {
    353    rows.push(await buildHistoryRow(row));
    354  }
    355  return rows;
    356 }
    357 
    358 /**
    359 * Merge two result lists, keeping `primary` order, then topping up from `secondary`,
    360 * while de-duping by url (fallback to id).
    361 *
    362 * @param {object[]} primary
    363 * @param {object[]} secondary
    364 * @param {number} limit
    365 * @returns {object[]}
    366 */
    367 export function mergeDedupe(primary, secondary, limit) {
    368  const seen = new Set();
    369  const out = [];
    370 
    371  const keyOf = r => r?.url || r?.id;
    372 
    373  for (const r of primary || []) {
    374    const k = keyOf(r);
    375    if (!seen.has(k)) {
    376      seen.add(k);
    377      out.push(r);
    378      if (out.length >= limit) {
    379        return out;
    380      }
    381    }
    382  }
    383 
    384  for (const r of secondary || []) {
    385    const k = keyOf(r);
    386    if (!seen.has(k)) {
    387      seen.add(k);
    388      out.push(r);
    389      if (out.length >= limit) {
    390        return out;
    391      }
    392    }
    393  }
    394 
    395  return out;
    396 }
    397 
/**
 * Public namespace bundling this module's exported helpers; frozen so
 * consumers cannot accidentally mutate the API surface at runtime.
 */
export const SearchBrowsingHistoryDomainBoost = Object.freeze({
  matchDomains,
  searchByDomains,
  mergeDedupe,
});