tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 738aad16509302b669354d90ac832a543ccb9691
parent 414417ee92315e4fbb07658d53e2e5d6adaa609c
Author: pstanciu <pstanciu@mozilla.com>
Date:   Mon, 29 Dec 2025 23:47:57 +0200

Revert "Bug 2006430 - Add workaround for general category queries r=tzhang,ai-models-reviewers" for causing bc failures @ browser_all_files_referenced.js

This reverts commit b9eea0033f00d7f08df55efd3b5ac7facb2ab963.

Diffstat:
M browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs | 28 ----------------------------
D browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs | 396 -------------------------------------------------------------------------------
M browser/components/aiwindow/models/moz.build | 1 -
D browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js | 53 -----------------------------------------------------
M browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml | 2 --
5 files changed, 0 insertions(+), 480 deletions(-)

diff --git a/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs @@ -11,9 +11,6 @@ ChromeUtils.defineESModuleGetters(lazy, { PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs", getPlacesSemanticHistoryManager: "resource://gre/modules/PlacesSemanticHistoryManager.sys.mjs", - // Domain fallback / workaround for general-category queries (games, movies, etc.) - SearchBrowsingHistoryDomainBoost: - "resource://gre/modules/SearchBrowsingHistoryDomainBoost.sys.mjs", }); /** @@ -284,31 +281,6 @@ async function searchBrowsingHistorySemantic({ for (let row of results) { rows.push(await buildHistoryRow(row)); } - - // Domain fallback for general-category queries (games, movies, news, etc.) - // Keep semantic ranking primary, only top-up if we have room. - if (rows.length < historyLimit) { - const domains = - lazy.SearchBrowsingHistoryDomainBoost.matchDomains(searchTerm); - if (domains?.length) { - const domainRows = - await lazy.SearchBrowsingHistoryDomainBoost.searchByDomains({ - conn, - domains, - startTs, - endTs, - historyLimit: Math.max(historyLimit * 2, 200), // extra for dedupe - buildHistoryRow, - }); - - return lazy.SearchBrowsingHistoryDomainBoost.mergeDedupe( - rows, - domainRows, - historyLimit - ); - } - } - return rows; } diff --git a/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs @@ -1,396 +0,0 @@ -/** - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ - -/** - * SearchBrowsingHistoryDomainBoost - * - * Temporary heuristic for general-category queries (games, movies, news, etc.) - * when semantic embeddings over title/description are insufficient. 
- * - * Safe to remove once richer embeddings or better intent classification lands. - */ - -export const CATEGORIES_JSON = { - language: "en", - categories: [ - { - id: "games", - terms: [ - "game", - "games", - "video game", - "video games", - "pc games", - "console games", - ], - domains: [ - "steampowered.com", - "roblox.com", - "ign.com", - "gamespot.com", - "polygon.com", - "metacritic.com", - "epicgames.com", - "store.playstation.com", - "xbox.com", - "nintendo.com", - ], - }, - { - id: "movies", - terms: ["movie", "movies", "film", "films", "cinema"], - domains: [ - "imdb.com", - "rottentomatoes.com", - "metacritic.com", - "letterboxd.com", - "netflix.com", - "primevideo.com", - "disneyplus.com", - "hulu.com", - "max.com", - ], - }, - { - id: "tv", - terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"], - domains: [ - "imdb.com", - "rottentomatoes.com", - "metacritic.com", - "tvmaze.com", - "thetvdb.com", - "netflix.com", - "primevideo.com", - "disneyplus.com", - "hulu.com", - "max.com", - ], - }, - { - id: "books", - terms: ["book", "books", "novel", "novels"], - domains: [ - "goodreads.com", - "gutenberg.org", - "openlibrary.org", - "barnesandnoble.com", - "indigo.ca", - ], - }, - { - id: "anime", - terms: ["anime", "manga"], - domains: [ - "myanimelist.net", - "anilist.co", - "kitsu.app", - "crunchyroll.com", - ], - }, - { - id: "music", - terms: ["music", "song", "songs", "album", "albums", "lyrics"], - domains: [ - "spotify.com", - "music.apple.com", - "soundcloud.com", - "bandcamp.com", - "music.youtube.com", - ], - }, - { - id: "podcasts", - terms: ["podcast", "podcasts"], - domains: [ - "podcasts.apple.com", - "overcast.fm", - "pocketcasts.com", - "castbox.fm", - ], - }, - { - id: "papers_research", - terms: [ - "paper", - "papers", - "research paper", - "research papers", - "academic paper", - "academic papers", - "journal", - "journals", - "study", - "studies", - "publication", - "publications", - ], - domains: [ - 
"scholar.google.com", - "arxiv.org", - "semanticscholar.org", - "pubmed.ncbi.nlm.nih.gov", - "researchgate.net", - "ieeexplore.ieee.org", - "dl.acm.org", - "springer.com", - "nature.com", - "science.org", - ], - }, - { - id: "tech_news", - terms: ["tech news", "technology news", "startup news"], - domains: [ - "theverge.com", - "techcrunch.com", - "wired.com", - "arstechnica.com", - "engadget.com", - ], - }, - { - id: "finance_news", - terms: ["finance news", "business news", "market news", "stock news"], - domains: [ - "bloomberg.com", - "wsj.com", - "ft.com", - "reuters.com", - "cnbc.com", - ], - }, - { - id: "news", - terms: [ - "news", - "headline", - "headlines", - "breaking news", - "world news", - "latest news", - ], - domains: [ - "reuters.com", - "apnews.com", - "bbc.com", - "cnn.com", - "nytimes.com", - "theguardian.com", - "washingtonpost.com", - "aljazeera.com", - "npr.org", - "wsj.com", - "bloomberg.com", - "ft.com", - ], - }, - { - id: "recipes", - terms: [ - "recipe", - "recipes", - "cooking", - "food", - "dinner ideas", - "meal prep", - ], - domains: [ - "allrecipes.com", - "seriouseats.com", - "foodnetwork.com", - "bbcgoodfood.com", - "epicurious.com", - "nytcooking.com", - ], - }, - { - id: "travel", - terms: ["travel", "hotels", "places", "destinations", "things to do"], - domains: [ - "tripadvisor.com", - "booking.com", - "expedia.com", - "airbnb.com", - "lonelyplanet.com", - ], - }, - ], -}; - -/** - * Normalizes a query string into a lowercase, space-separated form suitable for matching - * and comparison. - * - * @param {string} s - * @returns {string} - */ -function normalizeQuery(s) { - return (s || "") - .toLowerCase() - .replace(/[^\p{L}\p{N}]+/gu, " ") - .replace(/\s+/g, " ") - .trim(); -} - -/** - * Returns the matched category domains if searchTerm looks like a general category query. - * Uses phrase matching on normalized query string. 
- * - * @param {string} searchTerm - * @param {object} [categoriesJson=CATEGORIES_JSON] - * @returns {string[]|null} - */ -export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) { - const q = ` ${normalizeQuery(searchTerm)} `; - if (!q.trim()) { - return null; - } - - for (const cat of categoriesJson.categories) { - for (const t of cat.terms) { - // Pad with spaces to enable whole-token phrase matching via includes. - const tt = ` ${normalizeQuery(t)} `; - if (tt.trim() && q.includes(tt)) { - return cat.domains; - } - } - } - - return null; -} - -/** - * Builds a SQL WHERE clause for matching `http`/`https` URLs belonging - * to the given root domains and their `www` variants. - * - * @param {string[]} domains - * @returns {{ where: string, params: object }} - */ -function buildDomainUrlWhere(domains) { - const clauses = []; - const params = {}; - let i = 0; - - for (const raw of domains || []) { - const d = String(raw).toLowerCase(); - if (!d) { - continue; - } - - // - https://domain/... - // - https://www.domain/... - params[`d${i}`] = `%://${d}/%`; - clauses.push(`lower(url) LIKE :d${i++}`); - - params[`d${i}`] = `%://www.${d}/%`; - clauses.push(`lower(url) LIKE :d${i++}`); - } - - return { - where: clauses.length ? `(${clauses.join(" OR ")})` : "0", - params, - }; -} - -/** - * Domain-filtered moz_places query (time-windowed). 
- * - * @param {object} params - * @param {object} params.conn - * @param {string[]} params.domains - * @param {number|null} params.startTs - * @param {number|null} params.endTs - * @param {number} params.historyLimit - * @param {Function} params.buildHistoryRow - * @returns {Promise<object[]>} - */ -export async function searchByDomains({ - conn, - domains, - startTs, - endTs, - historyLimit, - buildHistoryRow, -}) { - if (!conn || !Array.isArray(domains) || !domains.length) { - return []; - } - - const { where, params } = buildDomainUrlWhere(domains); - - const results = await conn.executeCached( - ` - SELECT id, - title, - url, - NULL AS distance, - visit_count, - frecency, - last_visit_date, - preview_image_url - FROM moz_places - WHERE frecency <> 0 - AND (:startTs IS NULL OR last_visit_date >= :startTs) - AND (:endTs IS NULL OR last_visit_date <= :endTs) - AND ${where} - ORDER BY last_visit_date DESC, frecency DESC - LIMIT :limit - `, - { - startTs, - endTs, - limit: historyLimit, - ...params, - } - ); - - const rows = []; - for (const row of results) { - rows.push(await buildHistoryRow(row)); - } - return rows; -} - -/** - * Merge two result lists, keeping `primary` order, then topping up from `secondary`, - * while de-duping by url (fallback to id). 
- * - * @param {object[]} primary - * @param {object[]} secondary - * @param {number} limit - * @returns {object[]} - */ -export function mergeDedupe(primary, secondary, limit) { - const seen = new Set(); - const out = []; - - const keyOf = r => r?.url || r?.id; - - for (const r of primary || []) { - const k = keyOf(r); - if (!seen.has(k)) { - seen.add(k); - out.push(r); - if (out.length >= limit) { - return out; - } - } - } - - for (const r of secondary || []) { - const k = keyOf(r); - if (!seen.has(k)) { - seen.add(k); - out.push(r); - if (out.length >= limit) { - return out; - } - } - } - - return out; -} diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build @@ -28,7 +28,6 @@ MOZ_SRC_FILES += [ "InsightsSchemas.sys.mjs", "IntentClassifier.sys.mjs", "SearchBrowsingHistory.sys.mjs", - "SearchBrowsingHistoryDomainBoost.sys.mjs", "TitleGeneration.sys.mjs", "Tools.sys.mjs", "Utils.sys.mjs", diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js b/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js @@ -1,53 +0,0 @@ -/** - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- */ - -const { matchDomains, mergeDedupe } = ChromeUtils.importESModule( - "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs" -); - -add_task(async function test_matchDomains_games_and_boundary_behavior() { - // Positive: should match games category - const domains = matchDomains("video games"); - Assert.ok( - domains?.includes("steampowered.com"), - "Should include steampowered.com for games" - ); - - // Negative: should not match substrings inside words ("endgame" should not trigger "game") - const domains2 = matchDomains("endgame"); - Assert.equal(domains2, null, "Should not match 'game' inside 'endgame'"); -}); - -add_task(async function test_matchDomains_prefers_longer_phrases() { - // "tech news" should match tech_news (not generic news) - const domains = matchDomains("tech news"); - Assert.ok( - domains?.includes("techcrunch.com"), - "Should match tech_news domains" - ); - Assert.ok( - !domains.includes("reuters.com"), - "Should not fall back to generic news domains" - ); -}); - -add_task(async function test_mergeDedupe_semantic_first_then_topup() { - const primary = [ - { id: 1, url: "https://example.com/a", title: "A" }, - { id: 2, url: "https://example.com/b", title: "B" }, - ]; - const secondary = [ - { id: 3, url: "https://example.com/b", title: "B dup" }, // dup by url - { id: 4, url: "https://example.com/c", title: "C" }, - ]; - - const out = mergeDedupe(primary, secondary, 10); - Assert.deepEqual( - out.map(r => r.url), - ["https://example.com/a", "https://example.com/b", "https://example.com/c"], - "Should keep primary order and de-dupe by url" - ); -}); diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml @@ -28,8 +28,6 @@ support-files = [] ["test_SearchBrowsingHistory.js"] -["test_SearchBrowsingHistoryDomainBoost.js"] - ["test_TitleGeneration.js"] ["test_Tools_GetOpenTabs.js"]