commit b9eea0033f00d7f08df55efd3b5ac7facb2ab963
parent 092c54471a99f4494ab705618bbe7091469ed562
Author: frankjc2022 <frankjc2022@gmail.com>
Date: Mon, 29 Dec 2025 20:37:42 +0000
Bug 2006430 - Add workaround for general category queries r=tzhang,ai-models-reviewers
Differential Revision: https://phabricator.services.mozilla.com/D277615
Diffstat:
5 files changed, 480 insertions(+), 0 deletions(-)
diff --git a/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs
@@ -11,6 +11,9 @@ ChromeUtils.defineESModuleGetters(lazy, {
PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
getPlacesSemanticHistoryManager:
"resource://gre/modules/PlacesSemanticHistoryManager.sys.mjs",
});
+
+// Domain fallback / workaround for general-category queries (games, movies, etc.)
+// Loaded lazily as a whole module namespace rather than through
+// defineESModuleGetters: that helper binds each key to the same-named export,
+// and the module only exports matchDomains / searchByDomains / mergeDedupe /
+// CATEGORIES_JSON. The module is built via MOZ_SRC_FILES, so it is addressed
+// with a moz-src:/// URL (the same URL its xpcshell test imports).
+ChromeUtils.defineLazyGetter(lazy, "SearchBrowsingHistoryDomainBoost", () =>
+  ChromeUtils.importESModule(
+    "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs"
+  )
+);
/**
@@ -281,6 +284,31 @@ async function searchBrowsingHistorySemantic({
for (let row of results) {
rows.push(await buildHistoryRow(row));
}
+
+ // Domain fallback for general-category queries (games, movies, news, etc.)
+ // Keep semantic ranking primary, only top-up if we have room.
+ if (rows.length < historyLimit) {
+ const domains =
+ lazy.SearchBrowsingHistoryDomainBoost.matchDomains(searchTerm);
+ if (domains?.length) {
+ const domainRows =
+ await lazy.SearchBrowsingHistoryDomainBoost.searchByDomains({
+ conn,
+ domains,
+ startTs,
+ endTs,
+ historyLimit: Math.max(historyLimit * 2, 200), // extra for dedupe
+ buildHistoryRow,
+ });
+
+ return lazy.SearchBrowsingHistoryDomainBoost.mergeDedupe(
+ rows,
+ domainRows,
+ historyLimit
+ );
+ }
+ }
+
return rows;
}
diff --git a/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs
@@ -0,0 +1,396 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+/**
+ * SearchBrowsingHistoryDomainBoost
+ *
+ * Temporary heuristic for general-category queries (games, movies, news, etc.)
+ * when semantic embeddings over title/description are insufficient.
+ *
+ * Safe to remove once richer embeddings or better intent classification lands.
+ */
+
+/**
+ * Static table that maps general-category query terms to site domains.
+ *
+ * This is the default `categoriesJson` argument of matchDomains. That function
+ * reads only `categories[].terms` and `categories[].domains`; `language` and
+ * each category `id` are descriptive labels as far as this module is
+ * concerned.
+ *
+ * Ordering matters: matchDomains returns the domains of the FIRST category (in
+ * array order) that has a matching term. Narrow categories whose terms contain
+ * a broader term (e.g. "tech news" vs "news") must therefore stay ahead of the
+ * broad category. The same domain may appear in several categories.
+ */
+export const CATEGORIES_JSON = {
+  language: "en",
+  categories: [
+    {
+      id: "games",
+      terms: [
+        "game",
+        "games",
+        "video game",
+        "video games",
+        "pc games",
+        "console games",
+      ],
+      domains: [
+        "steampowered.com",
+        "roblox.com",
+        "ign.com",
+        "gamespot.com",
+        "polygon.com",
+        "metacritic.com",
+        "epicgames.com",
+        "store.playstation.com",
+        "xbox.com",
+        "nintendo.com",
+      ],
+    },
+    {
+      id: "movies",
+      terms: ["movie", "movies", "film", "films", "cinema"],
+      domains: [
+        "imdb.com",
+        "rottentomatoes.com",
+        "metacritic.com",
+        "letterboxd.com",
+        "netflix.com",
+        "primevideo.com",
+        "disneyplus.com",
+        "hulu.com",
+        "max.com",
+      ],
+    },
+    {
+      id: "tv",
+      terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"],
+      domains: [
+        "imdb.com",
+        "rottentomatoes.com",
+        "metacritic.com",
+        "tvmaze.com",
+        "thetvdb.com",
+        "netflix.com",
+        "primevideo.com",
+        "disneyplus.com",
+        "hulu.com",
+        "max.com",
+      ],
+    },
+    {
+      id: "books",
+      terms: ["book", "books", "novel", "novels"],
+      domains: [
+        "goodreads.com",
+        "gutenberg.org",
+        "openlibrary.org",
+        "barnesandnoble.com",
+        "indigo.ca",
+      ],
+    },
+    {
+      id: "anime",
+      terms: ["anime", "manga"],
+      domains: [
+        "myanimelist.net",
+        "anilist.co",
+        "kitsu.app",
+        "crunchyroll.com",
+      ],
+    },
+    {
+      id: "music",
+      terms: ["music", "song", "songs", "album", "albums", "lyrics"],
+      domains: [
+        "spotify.com",
+        "music.apple.com",
+        "soundcloud.com",
+        "bandcamp.com",
+        "music.youtube.com",
+      ],
+    },
+    {
+      id: "podcasts",
+      terms: ["podcast", "podcasts"],
+      domains: [
+        "podcasts.apple.com",
+        "overcast.fm",
+        "pocketcasts.com",
+        "castbox.fm",
+      ],
+    },
+    {
+      id: "papers_research",
+      terms: [
+        "paper",
+        "papers",
+        "research paper",
+        "research papers",
+        "academic paper",
+        "academic papers",
+        "journal",
+        "journals",
+        "study",
+        "studies",
+        "publication",
+        "publications",
+      ],
+      domains: [
+        "scholar.google.com",
+        "arxiv.org",
+        "semanticscholar.org",
+        "pubmed.ncbi.nlm.nih.gov",
+        "researchgate.net",
+        "ieeexplore.ieee.org",
+        "dl.acm.org",
+        "springer.com",
+        "nature.com",
+        "science.org",
+      ],
+    },
+    // The two specialised news categories precede the generic "news" category
+    // so that e.g. "tech news" resolves here rather than to the generic list.
+    {
+      id: "tech_news",
+      terms: ["tech news", "technology news", "startup news"],
+      domains: [
+        "theverge.com",
+        "techcrunch.com",
+        "wired.com",
+        "arstechnica.com",
+        "engadget.com",
+      ],
+    },
+    {
+      id: "finance_news",
+      terms: ["finance news", "business news", "market news", "stock news"],
+      domains: [
+        "bloomberg.com",
+        "wsj.com",
+        "ft.com",
+        "reuters.com",
+        "cnbc.com",
+      ],
+    },
+    {
+      id: "news",
+      terms: [
+        "news",
+        "headline",
+        "headlines",
+        "breaking news",
+        "world news",
+        "latest news",
+      ],
+      domains: [
+        "reuters.com",
+        "apnews.com",
+        "bbc.com",
+        "cnn.com",
+        "nytimes.com",
+        "theguardian.com",
+        "washingtonpost.com",
+        "aljazeera.com",
+        "npr.org",
+        "wsj.com",
+        "bloomberg.com",
+        "ft.com",
+      ],
+    },
+    {
+      id: "recipes",
+      terms: [
+        "recipe",
+        "recipes",
+        "cooking",
+        "food",
+        "dinner ideas",
+        "meal prep",
+      ],
+      domains: [
+        "allrecipes.com",
+        "seriouseats.com",
+        "foodnetwork.com",
+        "bbcgoodfood.com",
+        "epicurious.com",
+        "nytcooking.com",
+      ],
+    },
+    {
+      id: "travel",
+      terms: ["travel", "hotels", "places", "destinations", "things to do"],
+      domains: [
+        "tripadvisor.com",
+        "booking.com",
+        "expedia.com",
+        "airbnb.com",
+        "lonelyplanet.com",
+      ],
+    },
+  ],
+};
+
+/**
+ * Normalizes a query string into a lowercase, single-space-separated list of
+ * word tokens suitable for phrase matching and comparison.
+ *
+ * The text is NFKC-normalized first so that decomposed accents and
+ * compatibility forms (e.g. full-width letters) compare equal to their usual
+ * spelling. Every run of characters that is not a letter, combining mark or
+ * number is then collapsed into one space. Marks are kept on purpose: treating
+ * them as separators would split words written with combining characters.
+ *
+ * @param {string} s
+ *   Raw query text. Any non-string value is treated as an empty query
+ *   instead of throwing.
+ * @returns {string}
+ *   The normalized query, or "" when nothing matchable remains.
+ */
+function normalizeQuery(s) {
+  if (typeof s !== "string") {
+    return "";
+  }
+
+  // A single replace is enough: whitespace is itself "not a letter/mark/number",
+  // so each separator run (spaces included) already becomes exactly one space.
+  return s
+    .normalize("NFKC")
+    .toLowerCase()
+    .replace(/[^\p{L}\p{M}\p{N}]+/gu, " ")
+    .trim();
+}
+
+/**
+ * Looks up the domain list for a query that mentions a general category.
+ *
+ * The query and every category term go through normalizeQuery and are padded
+ * with one space on each side before the substring test, so a term only
+ * matches on whole-token boundaries ("game" is not found inside "endgame").
+ * Categories are tested in array order and the first one with a matching term
+ * wins.
+ *
+ * @param {string} searchTerm
+ * @param {object} [categoriesJson=CATEGORIES_JSON]
+ *   Object with a `categories` array of `{ terms, domains }` entries.
+ * @returns {string[]|null}
+ *   The `domains` of the first matching category, or null when the query is
+ *   empty after normalization or no term matches.
+ */
+export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) {
+  const normalizedQuery = normalizeQuery(searchTerm);
+  if (normalizedQuery === "") {
+    return null;
+  }
+
+  // Space padding turns String.prototype.includes into a whole-token test.
+  const haystack = ` ${normalizedQuery} `;
+  const termMatches = term => {
+    const needle = normalizeQuery(term);
+    return needle !== "" && haystack.includes(` ${needle} `);
+  };
+
+  const hit = categoriesJson.categories.find(({ terms }) =>
+    terms.some(term => termMatches(term))
+  );
+  return hit ? hit.domains : null;
+}
+
+/**
+ * Builds a SQL WHERE fragment matching `http`/`https` URLs whose host is one
+ * of the given root domains or its `www.` variant.
+ *
+ * Each pattern is anchored at the start of the URL (`https://host/%`). An
+ * unanchored `%://host/%` would also match other schemes and any URL that
+ * merely embeds `://host/` in its path or query string.
+ *
+ * @param {string[]} domains
+ *   Host names. Entries that are not strings, or are blank, are ignored.
+ * @returns {{ where: string, params: object }}
+ *   `where` is a parenthesized OR-list of `lower(url) LIKE :dN` clauses, or
+ *   the always-false literal "0" when no usable domain was given. `params`
+ *   holds the `dN` bindings referenced by `where`.
+ */
+function buildDomainUrlWhere(domains) {
+  const clauses = [];
+  const params = {};
+
+  for (const raw of domains || []) {
+    if (typeof raw !== "string") {
+      continue;
+    }
+    const host = raw.trim().toLowerCase();
+    if (!host) {
+      continue;
+    }
+
+    for (const scheme of ["https", "http"]) {
+      for (const prefix of ["", "www."]) {
+        const name = `d${clauses.length}`;
+        params[name] = `${scheme}://${prefix}${host}/%`;
+        clauses.push(`lower(url) LIKE :${name}`);
+      }
+    }
+  }
+
+  return {
+    where: clauses.length ? `(${clauses.join(" OR ")})` : "0",
+    params,
+  };
+}
+
+/**
+ * Runs a time-windowed moz_places query restricted to the given domains and
+ * converts every resulting row with `buildHistoryRow`.
+ *
+ * Rows with a frecency of 0 are excluded. Results are ordered by most recent
+ * visit first, then by frecency, and `distance` is always selected as NULL.
+ *
+ * @param {object} params
+ * @param {object} params.conn
+ *   Connection object exposing `executeCached(sql, bindings)`.
+ * @param {string[]} params.domains
+ *   Domains handed to buildDomainUrlWhere to produce the URL filter.
+ * @param {number|null} params.startTs
+ *   Lower bound compared against `last_visit_date`; null disables it.
+ * @param {number|null} params.endTs
+ *   Upper bound compared against `last_visit_date`; null disables it.
+ * @param {number} params.historyLimit
+ *   Bound to the query's LIMIT.
+ * @param {Function} params.buildHistoryRow
+ *   Called (and awaited) once per database row, in result order.
+ * @returns {Promise<object[]>}
+ *   The converted rows; an empty array when `conn` is missing or `domains`
+ *   is not a non-empty array.
+ */
+export async function searchByDomains({
+  conn,
+  domains,
+  startTs,
+  endTs,
+  historyLimit,
+  buildHistoryRow,
+}) {
+  const hasDomains = Array.isArray(domains) && domains.length;
+  if (!conn || !hasDomains) {
+    return [];
+  }
+
+  const domainFilter = buildDomainUrlWhere(domains);
+
+  const sql = `
+    SELECT id,
+           title,
+           url,
+           NULL AS distance,
+           visit_count,
+           frecency,
+           last_visit_date,
+           preview_image_url
+    FROM moz_places
+    WHERE frecency <> 0
+      AND (:startTs IS NULL OR last_visit_date >= :startTs)
+      AND (:endTs IS NULL OR last_visit_date <= :endTs)
+      AND ${domainFilter.where}
+    ORDER BY last_visit_date DESC, frecency DESC
+    LIMIT :limit
+  `;
+  const bindings = {
+    startTs,
+    endTs,
+    limit: historyLimit,
+    ...domainFilter.params,
+  };
+
+  const placesRows = await conn.executeCached(sql, bindings);
+
+  // Converted one at a time so the output keeps the query's ordering.
+  const historyRows = [];
+  for (const placesRow of placesRows) {
+    historyRows.push(await buildHistoryRow(placesRow));
+  }
+  return historyRows;
+}
+
+/**
+ * Merges two result lists: all of `primary` in its original order, then
+ * `secondary` as a top-up, stopping as soon as `limit` rows were collected.
+ *
+ * Rows are de-duplicated by `url`, falling back to `id` when the url is
+ * missing or empty. A row that has neither cannot be compared with anything
+ * and is therefore always kept. Null/undefined entries are skipped.
+ *
+ * @param {object[]} primary
+ *   Preferred rows; a missing list is treated as empty.
+ * @param {object[]} secondary
+ *   Rows used to fill remaining slots; a missing list is treated as empty.
+ * @param {number} limit
+ *   Maximum number of rows to return. Zero or a negative value yields an
+ *   empty array; `undefined` leaves the result unbounded.
+ * @returns {object[]}
+ *   A new array; neither input list is modified.
+ */
+export function mergeDedupe(primary, secondary, limit) {
+  const out = [];
+  // Checked up front: the in-loop test only runs after a row was pushed, which
+  // would otherwise let one row through for a limit of 0.
+  if (limit <= 0) {
+    return out;
+  }
+
+  const seen = new Set();
+  for (const list of [primary, secondary]) {
+    for (const row of list || []) {
+      if (row == null) {
+        continue;
+      }
+
+      const key = row.url || row.id;
+      const hasKey = key != null && key !== "";
+      if (hasKey) {
+        if (seen.has(key)) {
+          continue;
+        }
+        seen.add(key);
+      }
+
+      out.push(row);
+      if (out.length >= limit) {
+        return out;
+      }
+    }
+  }
+
+  return out;
+}
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -28,6 +28,7 @@ MOZ_SRC_FILES += [
"InsightsSchemas.sys.mjs",
"IntentClassifier.sys.mjs",
"SearchBrowsingHistory.sys.mjs",
+ "SearchBrowsingHistoryDomainBoost.sys.mjs",
"TitleGeneration.sys.mjs",
"Tools.sys.mjs",
"Utils.sys.mjs",
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js b/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js
@@ -0,0 +1,53 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+const { matchDomains, mergeDedupe } = ChromeUtils.importESModule(
+ "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs"
+);
+
+add_task(async function test_matchDomains_games_and_boundary_behavior() {
+  // A query made of a games term resolves to the games domain list.
+  const gameDomains = matchDomains("video games");
+  const hasSteam = gameDomains?.includes("steampowered.com");
+  Assert.ok(hasSteam, "Should include steampowered.com for games");
+
+  // Terms match whole tokens only, so "game" embedded in "endgame" is ignored.
+  Assert.equal(
+    matchDomains("endgame"),
+    null,
+    "Should not match 'game' inside 'endgame'"
+  );
+});
+
+add_task(async function test_matchDomains_prefers_longer_phrases() {
+  // "tech news" contains the generic term "news" as well; the tech_news
+  // category is expected to be the one that answers.
+  const matched = matchDomains("tech news");
+
+  const hasTechSite = matched?.includes("techcrunch.com");
+  Assert.ok(hasTechSite, "Should match tech_news domains");
+
+  const hasGenericNewsSite = matched.includes("reuters.com");
+  Assert.ok(
+    !hasGenericNewsSite,
+    "Should not fall back to generic news domains"
+  );
+});
+
+add_task(async function test_mergeDedupe_semantic_first_then_topup() {
+  const semanticRows = [
+    { id: 1, url: "https://example.com/a", title: "A" },
+    { id: 2, url: "https://example.com/b", title: "B" },
+  ];
+  // First entry repeats a url from the list above; second entry is new.
+  const domainRows = [
+    { id: 3, url: "https://example.com/b", title: "B dup" },
+    { id: 4, url: "https://example.com/c", title: "C" },
+  ];
+
+  const merged = mergeDedupe(semanticRows, domainRows, 10);
+  const mergedUrls = merged.map(({ url }) => url);
+
+  Assert.deepEqual(
+    mergedUrls,
+    ["https://example.com/a", "https://example.com/b", "https://example.com/c"],
+    "Should keep primary order and de-dupe by url"
+  );
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -28,6 +28,8 @@ support-files = []
["test_SearchBrowsingHistory.js"]
+["test_SearchBrowsingHistoryDomainBoost.js"]
+
["test_TitleGeneration.js"]
["test_Tools_GetOpenTabs.js"]