[ tor-browser ].git.dasho

commit b9eea0033f00d7f08df55efd3b5ac7facb2ab963
parent 092c54471a99f4494ab705618bbe7091469ed562
Author: frankjc2022 <frankjc2022@gmail.com>
Date:   Mon, 29 Dec 2025 20:37:42 +0000

Bug 2006430 - Add workaround for general category queries r=tzhang,ai-models-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D277615

Diffstat:
M browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs  | 28 ++++++++++++++++++++++++++++
A browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs  | 396 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/moz.build  | 1 +
A browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js  | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml  | 2 ++

5 files changed, 480 insertions(+), 0 deletions(-)
diff --git a/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs
@@ -11,6 +11,9 @@ ChromeUtils.defineESModuleGetters(lazy, {
   PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
   getPlacesSemanticHistoryManager:
     "resource://gre/modules/PlacesSemanticHistoryManager.sys.mjs",
+  // Domain fallback / workaround for general-category queries (games, movies, etc.)
+  // Loaded through moz-src: the module is listed in MOZ_SRC_FILES, so it is not
+  // installed under resource://gre/modules/.
+  SearchBrowsingHistoryDomainBoost:
+    "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs",
 });
 
 /**
@@ -281,6 +284,31 @@ async function searchBrowsingHistorySemantic({
   for (let row of results) {
     rows.push(await buildHistoryRow(row));
   }
+
+  // Domain fallback for general-category queries (games, movies, news, etc.)
+  // Keep semantic ranking primary, only top-up if we have room.
+  if (rows.length < historyLimit) {
+    const domains =
+      lazy.SearchBrowsingHistoryDomainBoost.matchDomains(searchTerm);
+    if (domains?.length) {
+      const domainRows =
+        await lazy.SearchBrowsingHistoryDomainBoost.searchByDomains({
+          conn,
+          domains,
+          startTs,
+          endTs,
+          historyLimit: Math.max(historyLimit * 2, 200), // extra for dedupe
+          buildHistoryRow,
+        });
+
+      return lazy.SearchBrowsingHistoryDomainBoost.mergeDedupe(
+        rows,
+        domainRows,
+        historyLimit
+      );
+    }
+  }
+
   return rows;
 }
 
diff --git a/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs b/browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs
@@ -0,0 +1,396 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+/**
+ * SearchBrowsingHistoryDomainBoost
+ *
+ * Temporary heuristic for general-category queries (games, movies, news, etc.)
+ * when semantic embeddings over title/description are insufficient.
+ *
+ * Safe to remove once richer embeddings or better intent classification lands.
+ */
+
+/**
+ * Static lookup table of general-interest categories.
+ *
+ * Each entry in `categories` carries:
+ *   - `id`: a short identifier for the category,
+ *   - `terms`: English words or multi-word phrases that signal the category,
+ *   - `domains`: bare host names (no scheme, no path) associated with it.
+ *
+ * `matchDomains` in this module uses this object as its default table.
+ * Some hosts are listed under several categories (for example
+ * metacritic.com appears in games, movies and tv).
+ * The `language` field is not read anywhere in this module.
+ */
+export const CATEGORIES_JSON = {
+  language: "en",
+  categories: [
+    {
+      id: "games",
+      terms: [
+        "game",
+        "games",
+        "video game",
+        "video games",
+        "pc games",
+        "console games",
+      ],
+      domains: [
+        "steampowered.com",
+        "roblox.com",
+        "ign.com",
+        "gamespot.com",
+        "polygon.com",
+        "metacritic.com",
+        "epicgames.com",
+        "store.playstation.com",
+        "xbox.com",
+        "nintendo.com",
+      ],
+    },
+    {
+      id: "movies",
+      terms: ["movie", "movies", "film", "films", "cinema"],
+      domains: [
+        "imdb.com",
+        "rottentomatoes.com",
+        "metacritic.com",
+        "letterboxd.com",
+        "netflix.com",
+        "primevideo.com",
+        "disneyplus.com",
+        "hulu.com",
+        "max.com",
+      ],
+    },
+    {
+      id: "tv",
+      terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"],
+      domains: [
+        "imdb.com",
+        "rottentomatoes.com",
+        "metacritic.com",
+        "tvmaze.com",
+        "thetvdb.com",
+        "netflix.com",
+        "primevideo.com",
+        "disneyplus.com",
+        "hulu.com",
+        "max.com",
+      ],
+    },
+    {
+      id: "books",
+      terms: ["book", "books", "novel", "novels"],
+      domains: [
+        "goodreads.com",
+        "gutenberg.org",
+        "openlibrary.org",
+        "barnesandnoble.com",
+        "indigo.ca",
+      ],
+    },
+    {
+      id: "anime",
+      terms: ["anime", "manga"],
+      domains: [
+        "myanimelist.net",
+        "anilist.co",
+        "kitsu.app",
+        "crunchyroll.com",
+      ],
+    },
+    {
+      id: "music",
+      terms: ["music", "song", "songs", "album", "albums", "lyrics"],
+      domains: [
+        "spotify.com",
+        "music.apple.com",
+        "soundcloud.com",
+        "bandcamp.com",
+        "music.youtube.com",
+      ],
+    },
+    {
+      id: "podcasts",
+      terms: ["podcast", "podcasts"],
+      domains: [
+        "podcasts.apple.com",
+        "overcast.fm",
+        "pocketcasts.com",
+        "castbox.fm",
+      ],
+    },
+    {
+      id: "papers_research",
+      terms: [
+        "paper",
+        "papers",
+        "research paper",
+        "research papers",
+        "academic paper",
+        "academic papers",
+        "journal",
+        "journals",
+        "study",
+        "studies",
+        "publication",
+        "publications",
+      ],
+      domains: [
+        "scholar.google.com",
+        "arxiv.org",
+        "semanticscholar.org",
+        "pubmed.ncbi.nlm.nih.gov",
+        "researchgate.net",
+        "ieeexplore.ieee.org",
+        "dl.acm.org",
+        "springer.com",
+        "nature.com",
+        "science.org",
+      ],
+    },
+    // The two narrower news categories below are listed ahead of the
+    // generic "news" entry that follows them.
+    {
+      id: "tech_news",
+      terms: ["tech news", "technology news", "startup news"],
+      domains: [
+        "theverge.com",
+        "techcrunch.com",
+        "wired.com",
+        "arstechnica.com",
+        "engadget.com",
+      ],
+    },
+    {
+      id: "finance_news",
+      terms: ["finance news", "business news", "market news", "stock news"],
+      domains: [
+        "bloomberg.com",
+        "wsj.com",
+        "ft.com",
+        "reuters.com",
+        "cnbc.com",
+      ],
+    },
+    {
+      id: "news",
+      terms: [
+        "news",
+        "headline",
+        "headlines",
+        "breaking news",
+        "world news",
+        "latest news",
+      ],
+      domains: [
+        "reuters.com",
+        "apnews.com",
+        "bbc.com",
+        "cnn.com",
+        "nytimes.com",
+        "theguardian.com",
+        "washingtonpost.com",
+        "aljazeera.com",
+        "npr.org",
+        "wsj.com",
+        "bloomberg.com",
+        "ft.com",
+      ],
+    },
+    {
+      id: "recipes",
+      terms: [
+        "recipe",
+        "recipes",
+        "cooking",
+        "food",
+        "dinner ideas",
+        "meal prep",
+      ],
+      domains: [
+        "allrecipes.com",
+        "seriouseats.com",
+        "foodnetwork.com",
+        "bbcgoodfood.com",
+        "epicurious.com",
+        "nytcooking.com",
+      ],
+    },
+    {
+      id: "travel",
+      terms: ["travel", "hotels", "places", "destinations", "things to do"],
+      domains: [
+        "tripadvisor.com",
+        "booking.com",
+        "expedia.com",
+        "airbnb.com",
+        "lonelyplanet.com",
+      ],
+    },
+  ],
+};
+
+/**
+ * Canonicalizes free text for phrase comparison: lowercases it, turns every
+ * run of characters that are neither Unicode letters nor Unicode numbers
+ * into a single space, and strips leading/trailing spaces.
+ *
+ * Falsy input (null, undefined, "") yields the empty string.
+ *
+ * @param {string} s
+ * @returns {string}
+ */
+function normalizeQuery(s) {
+  const lowered = (s || "").toLowerCase();
+  // Punctuation, symbols and whitespace all become word separators.
+  const separated = lowered.replace(/[^\p{L}\p{N}]+/gu, " ");
+  const collapsed = separated.replace(/\s+/g, " ");
+  return collapsed.trim();
+}
+
+/**
+ * Returns the domain list of the category that best matches `searchTerm`,
+ * or null when the query does not look like a general-category query.
+ *
+ * Matching is whole-token phrase matching on the normalized query. When
+ * several terms match, the longest normalized term wins, so a specific
+ * phrase such as "tech news" beats a generic one such as "news" no matter
+ * how the categories are ordered. Ties keep the earliest-declared category.
+ *
+ * A table without a `categories` array, or a category without `terms`, is
+ * treated as empty instead of throwing.
+ *
+ * @param {string} searchTerm
+ * @param {object} [categoriesJson=CATEGORIES_JSON]
+ * @returns {string[]|null}
+ */
+export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) {
+  const query = normalizeQuery(searchTerm);
+  if (!query) {
+    return null;
+  }
+
+  // Pad with spaces to enable whole-token phrase matching via includes.
+  const haystack = ` ${query} `;
+
+  let bestDomains = null;
+  let bestLength = 0;
+
+  for (const category of categoriesJson?.categories ?? []) {
+    for (const term of category?.terms ?? []) {
+      const phrase = normalizeQuery(term);
+      // Strictly-longer comparison: empty phrases never match and ties keep
+      // the category that was declared first.
+      if (phrase.length > bestLength && haystack.includes(` ${phrase} `)) {
+        bestDomains = category.domains;
+        bestLength = phrase.length;
+      }
+    }
+  }
+
+  return bestDomains;
+}
+
+/**
+ * Builds a SQL WHERE fragment matching `http`/`https` URLs whose host is one
+ * of the given domains or its `www.` variant.
+ *
+ * Every pattern is anchored at the scheme (`https://host/%`), so a host that
+ * merely appears later in a URL (for example inside a redirect's query
+ * string) or under another scheme does not match.
+ *
+ * Non-string and blank entries are skipped, and repeated hosts are emitted
+ * once. Hosts are bound as-is: LIKE wildcards (`%`, `_`) inside a host are
+ * not escaped.
+ *
+ * @param {string[]} domains
+ * @returns {{ where: string, params: object }}
+ *   `where` is a parenthesized OR-list of `lower(url) LIKE :dN` clauses, or
+ *   the constant "0" (never true) when there is nothing to match; `params`
+ *   maps each `dN` name to its LIKE pattern.
+ */
+function buildDomainUrlWhere(domains) {
+  const schemes = ["http", "https"];
+  const hostPrefixes = ["", "www."];
+  const clauses = [];
+  const params = {};
+  const seenHosts = new Set();
+
+  for (const raw of domains || []) {
+    if (typeof raw !== "string") {
+      continue;
+    }
+    const host = raw.trim().toLowerCase();
+    if (!host || seenHosts.has(host)) {
+      continue;
+    }
+    seenHosts.add(host);
+
+    // - http(s)://domain/...
+    // - http(s)://www.domain/...
+    for (const scheme of schemes) {
+      for (const prefix of hostPrefixes) {
+        const name = `d${clauses.length}`;
+        params[name] = `${scheme}://${prefix}${host}/%`;
+        clauses.push(`lower(url) LIKE :${name}`);
+      }
+    }
+  }
+
+  return {
+    where: clauses.length ? `(${clauses.join(" OR ")})` : "0",
+    params,
+  };
+}
+
+/**
+ * Looks up moz_places rows whose URL belongs to one of `domains`, optionally
+ * restricted to a last-visit time window.
+ *
+ * Resolves to an empty array without touching the database when `conn` is
+ * missing or `domains` is not a non-empty array. Otherwise rows come back
+ * most-recently-visited first (frecency breaks ties), capped at
+ * `historyLimit`, and each one is passed through `buildHistoryRow` one at a
+ * time, in order.
+ *
+ * @param {object} params
+ * @param {object} params.conn
+ * @param {string[]} params.domains
+ * @param {number|null} params.startTs
+ * @param {number|null} params.endTs
+ * @param {number} params.historyLimit
+ * @param {Function} params.buildHistoryRow
+ * @returns {Promise<object[]>}
+ */
+export async function searchByDomains({
+  conn,
+  domains,
+  startTs,
+  endTs,
+  historyLimit,
+  buildHistoryRow,
+}) {
+  const hasDomains = Array.isArray(domains) && domains.length;
+  if (!conn || !hasDomains) {
+    return [];
+  }
+
+  const domainFilter = buildDomainUrlWhere(domains);
+
+  // A NULL bound disables that side of the time window.
+  const sql = `
+      SELECT id,
+             title,
+             url,
+             NULL AS distance,
+             visit_count,
+             frecency,
+             last_visit_date,
+             preview_image_url
+      FROM moz_places
+      WHERE frecency <> 0
+        AND (:startTs IS NULL OR last_visit_date >= :startTs)
+        AND (:endTs IS NULL OR last_visit_date <= :endTs)
+        AND ${domainFilter.where}
+      ORDER BY last_visit_date DESC, frecency DESC
+      LIMIT :limit
+    `;
+  const bindings = {
+    startTs,
+    endTs,
+    limit: historyLimit,
+    ...domainFilter.params,
+  };
+
+  const matches = await conn.executeCached(sql, bindings);
+
+  const history = [];
+  for (const match of matches) {
+    const built = await buildHistoryRow(match);
+    history.push(built);
+  }
+  return history;
+}
+
+/**
+ * Merges two result lists: `primary` rows first, in their original order,
+ * then `secondary` rows as a top-up, stopping once `limit` rows are kept.
+ *
+ * Rows are de-duplicated by `url`, falling back to `id` when the url is
+ * missing or empty. A row with neither cannot be proven to be a duplicate,
+ * so it is kept as-is. Null/undefined entries are dropped.
+ *
+ * @param {object[]} primary
+ * @param {object[]} secondary
+ * @param {number} limit
+ *   Maximum number of rows to return; zero or a negative value yields [].
+ * @returns {object[]}
+ */
+export function mergeDedupe(primary, secondary, limit) {
+  const out = [];
+  // Guard up front: the per-row check below only runs after a push.
+  if (limit <= 0) {
+    return out;
+  }
+
+  const seen = new Set();
+
+  for (const rows of [primary, secondary]) {
+    for (const row of rows || []) {
+      if (row === null || row === undefined) {
+        continue;
+      }
+
+      const key = row.url || row.id;
+      if (key !== null && key !== undefined) {
+        if (seen.has(key)) {
+          continue;
+        }
+        seen.add(key);
+      }
+
+      out.push(row);
+      if (out.length >= limit) {
+        return out;
+      }
+    }
+  }
+
+  return out;
+}
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -28,6 +28,7 @@ MOZ_SRC_FILES += [
     "InsightsSchemas.sys.mjs",
     "IntentClassifier.sys.mjs",
     "SearchBrowsingHistory.sys.mjs",
+    "SearchBrowsingHistoryDomainBoost.sys.mjs",
     "TitleGeneration.sys.mjs",
     "Tools.sys.mjs",
     "Utils.sys.mjs",
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js b/browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js
@@ -0,0 +1,53 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+const { matchDomains, mergeDedupe } = ChromeUtils.importESModule(
+  "moz-src:///browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs"
+);
+
+add_task(async function test_matchDomains_games_and_boundary_behavior() {
+  // A plain games phrase resolves to the games domain list.
+  const gameDomains = matchDomains("video games");
+  const hasSteam = gameDomains?.includes("steampowered.com");
+  Assert.ok(hasSteam, "Should include steampowered.com for games");
+
+  // Terms only match on token boundaries: "game" embedded in "endgame"
+  // must not trigger the games category.
+  Assert.equal(
+    matchDomains("endgame"),
+    null,
+    "Should not match 'game' inside 'endgame'"
+  );
+});
+
+add_task(async function test_matchDomains_prefers_longer_phrases() {
+  // The two-word phrase must select tech_news rather than the generic
+  // news category.
+  const matched = matchDomains("tech news");
+
+  const hasTechSite = matched?.includes("techcrunch.com");
+  Assert.ok(hasTechSite, "Should match tech_news domains");
+
+  const hasGenericNewsSite = matched.includes("reuters.com");
+  Assert.ok(
+    !hasGenericNewsSite,
+    "Should not fall back to generic news domains"
+  );
+});
+
+add_task(async function test_mergeDedupe_semantic_first_then_topup() {
+  // Semantic (primary) hits.
+  const rowA = { id: 1, url: "https://example.com/a", title: "A" };
+  const rowB = { id: 2, url: "https://example.com/b", title: "B" };
+  // Domain (secondary) hits: the first shares its url with rowB, so it is
+  // a duplicate even though its id differs.
+  const rowBDup = { id: 3, url: "https://example.com/b", title: "B dup" };
+  const rowC = { id: 4, url: "https://example.com/c", title: "C" };
+
+  const merged = mergeDedupe([rowA, rowB], [rowBDup, rowC], 10);
+  const mergedUrls = merged.map(row => row.url);
+
+  Assert.deepEqual(
+    mergedUrls,
+    ["https://example.com/a", "https://example.com/b", "https://example.com/c"],
+    "Should keep primary order and de-dupe by url"
+  );
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -28,6 +28,8 @@ support-files = []
 
 ["test_SearchBrowsingHistory.js"]
 
+["test_SearchBrowsingHistoryDomainBoost.js"]
+
 ["test_TitleGeneration.js"]
 
 ["test_Tools_GetOpenTabs.js"]

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	browser/components/aiwindow/models/SearchBrowsingHistory.sys.mjs	\|	28	++++++++++++++++++++++++++++
A	browser/components/aiwindow/models/SearchBrowsingHistoryDomainBoost.sys.mjs	\|	396	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	browser/components/aiwindow/models/moz.build	\|	1	+
A	browser/components/aiwindow/models/tests/xpcshell/test_SearchBrowsingHistoryDomainBoost.js	\|	53	+++++++++++++++++++++++++++++++++++++++++++++++++++++
M	browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml	\|	2	++