tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit a06e07a18962bfceed8da6aeec09a6e613bef318
parent 0d81523fc001f74ee970a750c20d994216f4f161
Author: frankjc2022 <frankjc2022@gmail.com>
Date:   Tue, 30 Dec 2025 01:18:22 +0000

Bug 2006430 - Add workaround for general category queries r=tzhang,ai-models-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D277615

Diffstat:
M browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs | 28 ++++++++++++++++++++++++++++
A browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs | 396 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/moz.build | 1 +
A browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml | 2 ++
5 files changed, 480 insertions(+), 0 deletions(-)

diff --git a/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs @@ -11,6 +11,9 @@ ChromeUtils.defineESModuleGetters(lazy, { PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs", getPlacesSemanticHistoryManager: "resource://gre/modules/PlacesSemanticHistoryManager.sys.mjs", + // Domain fallback / workaround for general-category queries (games, movies, etc.) + SearchBrowsingHistoryDomainBoost: + "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs", }); /** @@ -281,6 +284,31 @@ async function searchBrowsingHistorySemantic({ for (let row of results) { rows.push(await buildHistoryRow(row)); } + + // Domain fallback for general-category queries (games, movies, news, etc.) + // Keep semantic ranking primary, only top-up if we have room. + if (rows.length < historyLimit) { + const domains = + lazy.SearchBrowsingHistoryDomainBoost.matchDomains(searchTerm); + if (domains?.length) { + const domainRows = + await lazy.SearchBrowsingHistoryDomainBoost.searchByDomains({ + conn, + domains, + startTs, + endTs, + historyLimit: Math.max(historyLimit * 2, 200), // extra for dedupe + buildHistoryRow, + }); + + return lazy.SearchBrowsingHistoryDomainBoost.mergeDedupe( + rows, + domainRows, + historyLimit + ); + } + } + return rows; } diff --git a/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs @@ -0,0 +1,396 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +/** + * SearchBrowsingHistoryDomainBoost + * + * Temporary heuristic for general-category queries (games, movies, news, etc.) + * when semantic embeddings over title/description are insufficient. 
+ * + * Safe to remove once richer embeddings or better intent classification lands. + */ + +export const CATEGORIES_JSON = { + language: "en", + categories: [ + { + id: "games", + terms: [ + "game", + "games", + "video game", + "video games", + "pc games", + "console games", + ], + domains: [ + "steampowered.com", + "roblox.com", + "ign.com", + "gamespot.com", + "polygon.com", + "metacritic.com", + "epicgames.com", + "store.playstation.com", + "xbox.com", + "nintendo.com", + ], + }, + { + id: "movies", + terms: ["movie", "movies", "film", "films", "cinema"], + domains: [ + "imdb.com", + "rottentomatoes.com", + "metacritic.com", + "letterboxd.com", + "netflix.com", + "primevideo.com", + "disneyplus.com", + "hulu.com", + "max.com", + ], + }, + { + id: "tv", + terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"], + domains: [ + "imdb.com", + "rottentomatoes.com", + "metacritic.com", + "tvmaze.com", + "thetvdb.com", + "netflix.com", + "primevideo.com", + "disneyplus.com", + "hulu.com", + "max.com", + ], + }, + { + id: "books", + terms: ["book", "books", "novel", "novels"], + domains: [ + "goodreads.com", + "gutenberg.org", + "openlibrary.org", + "barnesandnoble.com", + "indigo.ca", + ], + }, + { + id: "anime", + terms: ["anime", "manga"], + domains: [ + "myanimelist.net", + "anilist.co", + "kitsu.app", + "crunchyroll.com", + ], + }, + { + id: "music", + terms: ["music", "song", "songs", "album", "albums", "lyrics"], + domains: [ + "spotify.com", + "music.apple.com", + "soundcloud.com", + "bandcamp.com", + "music.youtube.com", + ], + }, + { + id: "podcasts", + terms: ["podcast", "podcasts"], + domains: [ + "podcasts.apple.com", + "overcast.fm", + "pocketcasts.com", + "castbox.fm", + ], + }, + { + id: "papers_research", + terms: [ + "paper", + "papers", + "research paper", + "research papers", + "academic paper", + "academic papers", + "journal", + "journals", + "study", + "studies", + "publication", + "publications", + ], + domains: [ + 
"scholar.google.com", + "arxiv.org", + "semanticscholar.org", + "pubmed.ncbi.nlm.nih.gov", + "researchgate.net", + "ieeexplore.ieee.org", + "dl.acm.org", + "springer.com", + "nature.com", + "science.org", + ], + }, + { + id: "tech_news", + terms: ["tech news", "technology news", "startup news"], + domains: [ + "theverge.com", + "techcrunch.com", + "wired.com", + "arstechnica.com", + "engadget.com", + ], + }, + { + id: "finance_news", + terms: ["finance news", "business news", "market news", "stock news"], + domains: [ + "bloomberg.com", + "wsj.com", + "ft.com", + "reuters.com", + "cnbc.com", + ], + }, + { + id: "news", + terms: [ + "news", + "headline", + "headlines", + "breaking news", + "world news", + "latest news", + ], + domains: [ + "reuters.com", + "apnews.com", + "bbc.com", + "cnn.com", + "nytimes.com", + "theguardian.com", + "washingtonpost.com", + "aljazeera.com", + "npr.org", + "wsj.com", + "bloomberg.com", + "ft.com", + ], + }, + { + id: "recipes", + terms: [ + "recipe", + "recipes", + "cooking", + "food", + "dinner ideas", + "meal prep", + ], + domains: [ + "allrecipes.com", + "seriouseats.com", + "foodnetwork.com", + "bbcgoodfood.com", + "epicurious.com", + "nytcooking.com", + ], + }, + { + id: "travel", + terms: ["travel", "hotels", "places", "destinations", "things to do"], + domains: [ + "tripadvisor.com", + "booking.com", + "expedia.com", + "airbnb.com", + "lonelyplanet.com", + ], + }, + ], +}; + +/** + * Normalizes a query string into a lowercase, space-separated form suitable for matching + * and comparison. + * + * @param {string} s + * @returns {string} + */ +function normalizeQuery(s) { + return (s || "") + .toLowerCase() + .replace(/[^\p{L}\p{N}]+/gu, " ") + .replace(/\s+/g, " ") + .trim(); +} + +/** + * Returns the matched category domains if searchTerm looks like a general category query. + * Uses phrase matching on normalized query string. 
+ * + * @param {string} searchTerm + * @param {object} [categoriesJson=CATEGORIES_JSON] + * @returns {string[]|null} + */ +export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) { + const q = ` ${normalizeQuery(searchTerm)} `; + if (!q.trim()) { + return null; + } + + for (const cat of categoriesJson.categories) { + for (const t of cat.terms) { + // Pad with spaces to enable whole-token phrase matching via includes. + const tt = ` ${normalizeQuery(t)} `; + if (tt.trim() && q.includes(tt)) { + return cat.domains; + } + } + } + + return null; +} + +/** + * Builds a SQL WHERE clause for matching `http`/`https` URLs belonging + * to the given root domains and their `www` variants. + * + * @param {string[]} domains + * @returns {{ where: string, params: object }} + */ +function buildDomainUrlWhere(domains) { + const clauses = []; + const params = {}; + let i = 0; + + for (const raw of domains || []) { + const d = String(raw).toLowerCase(); + if (!d) { + continue; + } + + // - https://domain/... + // - https://www.domain/... + params[`d${i}`] = `%://${d}/%`; + clauses.push(`lower(url) LIKE :d${i++}`); + + params[`d${i}`] = `%://www.${d}/%`; + clauses.push(`lower(url) LIKE :d${i++}`); + } + + return { + where: clauses.length ? `(${clauses.join(" OR ")})` : "0", + params, + }; +} + +/** + * Domain-filtered moz_places query (time-windowed). 
+ * + * @param {object} params + * @param {object} params.conn + * @param {string[]} params.domains + * @param {number|null} params.startTs + * @param {number|null} params.endTs + * @param {number} params.historyLimit + * @param {Function} params.buildHistoryRow + * @returns {Promise<object[]>} + */ +export async function searchByDomains({ + conn, + domains, + startTs, + endTs, + historyLimit, + buildHistoryRow, +}) { + if (!conn || !Array.isArray(domains) || !domains.length) { + return []; + } + + const { where, params } = buildDomainUrlWhere(domains); + + const results = await conn.executeCached( + ` + SELECT id, + title, + url, + NULL AS distance, + visit_count, + frecency, + last_visit_date, + preview_image_url + FROM moz_places + WHERE frecency <> 0 + AND (:startTs IS NULL OR last_visit_date >= :startTs) + AND (:endTs IS NULL OR last_visit_date <= :endTs) + AND ${where} + ORDER BY last_visit_date DESC, frecency DESC + LIMIT :limit + `, + { + startTs, + endTs, + limit: historyLimit, + ...params, + } + ); + + const rows = []; + for (const row of results) { + rows.push(await buildHistoryRow(row)); + } + return rows; +} + +/** + * Merge two result lists, keeping `primary` order, then topping up from `secondary`, + * while de-duping by url (fallback to id). 
+ * + * @param {object[]} primary + * @param {object[]} secondary + * @param {number} limit + * @returns {object[]} + */ +export function mergeDedupe(primary, secondary, limit) { + const seen = new Set(); + const out = []; + + const keyOf = r => r?.url || r?.id; + + for (const r of primary || []) { + const k = keyOf(r); + if (!seen.has(k)) { + seen.add(k); + out.push(r); + if (out.length >= limit) { + return out; + } + } + } + + for (const r of secondary || []) { + const k = keyOf(r); + if (!seen.has(k)) { + seen.add(k); + out.push(r); + if (out.length >= limit) { + return out; + } + } + } + + return out; +} diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build @@ -28,6 +28,7 @@ MOZ_SRC_FILES += [ "InsightsSchemas.sys.mjs", "IntentClassifier.sys.mjs", "SearchBrowsingHistory.sys.mjs", + "SearchBrowsingHistoryDomainBoost.sys.mjs", "TitleGeneration.sys.mjs", "Tools.sys.mjs", "Utils.sys.mjs", diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js b/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js @@ -0,0 +1,53 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +const { matchDomains, mergeDedupe } = ChromeUtils.importESModule( + "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs" +); + +add_task(async function test_matchDomains_games_and_boundary_behavior() { + // Positive: should match games category + const domains = matchDomains("video games"); + Assert.ok( + domains?.includes("steampowered.com"), + "Should include steampowered.com for games" + ); + + // Negative: should not match substrings inside words ("endgame" should not trigger "game") + const domains2 = matchDomains("endgame"); + Assert.equal(domains2, null, "Should not match 'game' inside 'endgame'"); +}); + +add_task(async function test_matchDomains_prefers_longer_phrases() { + // "tech news" should match tech_news (not generic news) + const domains = matchDomains("tech news"); + Assert.ok( + domains?.includes("techcrunch.com"), + "Should match tech_news domains" + ); + Assert.ok( + !domains.includes("reuters.com"), + "Should not fall back to generic news domains" + ); +}); + +add_task(async function test_mergeDedupe_semantic_first_then_topup() { + const primary = [ + { id: 1, url: "https://example.com/a", title: "A" }, + { id: 2, url: "https://example.com/b", title: "B" }, + ]; + const secondary = [ + { id: 3, url: "https://example.com/b", title: "B dup" }, // dup by url + { id: 4, url: "https://example.com/c", title: "C" }, + ]; + + const out = mergeDedupe(primary, secondary, 10); + Assert.deepEqual( + out.map(r => r.url), + ["https://example.com/a", "https://example.com/b", "https://example.com/c"], + "Should keep primary order and de-dupe by url" + ); +}); diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml @@ -28,6 +28,8 @@ support-files = [] ["test_SearchBrowsingHistory.js"] +["test_SearchBrowsingHistoryDomainBoost.js"] + ["test_TitleGeneration.js"] ["test_Tools_GetOpenTabs.js"]