[ tor-browser ].git.dasho

commit 22284e85a1de490c37bb63364be4a3cb03c6657a
parent a3565941a5eee04c452674570e64ab55274ab869
Author: Tzu-An Liu <tliu@mozilla.com>
Date:   Fri, 21 Nov 2025 20:39:03 +0000

Bug 2000945 - Move query intent detection to AI-window r=Mardak,ai-models-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D273259

Diffstat:
A browser/components/aiwindow/models/IntentClassifier.sys.mjs  | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/moz.build  | 6 ++++++
A browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js  | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml  | 5 +++++

4 files changed, 549 insertions(+), 0 deletions(-)
diff --git a/browser/components/aiwindow/models/IntentClassifier.sys.mjs b/browser/components/aiwindow/models/IntentClassifier.sys.mjs
@@ -0,0 +1,235 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+import { createEngine } from "chrome://global/content/ml/EngineProcess.sys.mjs";
+
+// Phrases that force the "chat" intent. IntentClassifier builds its
+// `_isForcedChat` checker from this list, and getPromptIntent() returns "chat"
+// before creating the model engine when that checker matches the query.
+// Entries are tokenized by buildChatAllowlist(), so letter case, spacing and
+// punctuation (such as the apostrophe in "what’s up") do not affect matching.
+const FORCED_CHAT_PHRASES = [
+  "amuse me",
+  "are we alone",
+  "are you alive",
+  "are you gpt",
+  "are you human",
+  "are you real",
+  "bark like dog",
+  "cheer me up",
+  "comfort me",
+  "count numbers",
+  "curse me",
+  "do aliens exist",
+  "do we matter",
+  "do you dream",
+  "do you think",
+  "does fate exist",
+  "dream meaning",
+  "drop wisdom",
+  "encourage me",
+  "entertain me",
+  "explain yourself",
+  "flip coin",
+  "give blessing",
+  "give wisdom",
+  "good morning",
+  "good night",
+  "guess number",
+  "hallo",
+  "hello",
+  "hey",
+  "hi",
+  "hola",
+  "how are you",
+  "inspire me",
+  "invent a word",
+  "invent holiday",
+  "invent joke",
+  "is god real",
+  "life advice",
+  "life purpose",
+  "list animals",
+  "list capitals",
+  "list colors",
+  "list countries",
+  "list elements",
+  "list fruits",
+  "list metals",
+  "list oceans",
+  "list planets",
+  "list shapes",
+  "meaning of life",
+  "meow like cat",
+  "motivate me",
+  "now you are",
+  "play a game",
+  "pretend alien",
+  "pretend child",
+  "pretend detective",
+  "pretend ghost",
+  "pretend pirate",
+  "pretend robot",
+  "pretend superhero",
+  "pretend teacher",
+  "pretend wizard",
+  "random fact",
+  "random number",
+  "roll dice",
+  "goodbye",
+  "simulate chat",
+  "simulate future",
+  "simulate past",
+  "sing like robot",
+  "sing lullaby",
+  "sing rap",
+  "sup",
+  "surprise me",
+  "teach me",
+  "tell bedtime story",
+  "tell fortune",
+  "tell joke",
+  "tell prophecy",
+  "tell riddle",
+  "tell story",
+  "what is art",
+  "what is beauty",
+  "what is death",
+  "what is freedom",
+  "what is justice",
+  "what is love",
+  "what is mind",
+  "what is reality",
+  "what is right",
+  "what is self",
+  "what is soul",
+  "what is time",
+  "what is truth",
+  "what is wrong",
+  "what model are you",
+  "what version",
+  "what’s up",
+  "which model are you",
+  "who am i",
+  "who are you",
+  "who made you",
+  "why are we",
+  "write a poem",
+  "write a song",
+  "write haiku",
+  "write quote",
+  "your model is",
+];
+
+/**
+ * Canonicalizes text for allowlist matching: applies NFKC normalization,
+ * lowercases, collapses every whitespace run to one space and trims the ends.
+ *
+ * NFKC runs before lowercasing because some compatibility characters only
+ * become cased letters once normalized (for example "ℍ" normalizes to "H"),
+ * so lowercasing first could leave uppercase letters in the result.
+ *
+ * @param {string} s - Raw text.
+ * @returns {string} The normalized text.
+ */
+export function normalizeTextForChatAllowlist(s) {
+  return s.normalize("NFKC").toLowerCase().replace(/\s+/g, " ").trim();
+}
+
+/**
+ * Breaks text into word tokens for allowlist matching.
+ *
+ * The text is normalized first and then cut at every run of characters that
+ * are not Unicode letters, Unicode numbers or underscores. Empty pieces
+ * (produced by leading or trailing separators) are dropped.
+ *
+ * @param {string} s - Raw text.
+ * @returns {string[]} The tokens, in the order they appear.
+ */
+export function tokenizeTextForChatAllowlist(s) {
+  const normalized = normalizeTextForChatAllowlist(s);
+  const pieces = normalized.split(/[^\p{L}\p{N}_]+/u);
+  return pieces.filter(piece => piece !== "");
+}
+
+/**
+ * Groups phrases by how many tokens they contain.
+ *
+ * Each phrase is tokenized and stored as its tokens joined by single spaces.
+ * Phrases that produce no tokens are skipped.
+ *
+ * @param {Iterable<string>} phrases - Phrases to index.
+ * @returns {Map<number, Set<string>>} Token count -> set of "tok tok ..." keys.
+ */
+export function buildChatAllowlist(phrases) {
+  const phrasesByTokenCount = new Map();
+  for (const phrase of phrases) {
+    const tokens = tokenizeTextForChatAllowlist(phrase);
+    if (tokens.length === 0) {
+      continue;
+    }
+    let bucket = phrasesByTokenCount.get(tokens.length);
+    if (bucket === undefined) {
+      bucket = new Set();
+      phrasesByTokenCount.set(tokens.length, bucket);
+    }
+    bucket.add(tokens.join(" "));
+  }
+  return phrasesByTokenCount;
+}
+
+/**
+ * Builds a checker answering "does this query contain any of the phrases as a
+ * run of whole tokens?".
+ *
+ * Phrases and queries are compared token by token, so a phrase never matches
+ * inside a longer word and punctuation between words is ignored.
+ *
+ * @param {Iterable<string>} phrases - Phrases to look for.
+ * @returns {function(string): boolean} Checker that returns true when the
+ *   query contains at least one phrase.
+ */
+export function makeIsolatedPhraseChecker(phrases) {
+  const byLen = buildChatAllowlist(phrases);
+  // Results are memoized per normalized query. Queries are arbitrary text, so
+  // the memo is capped and its oldest entry is evicted once it is full;
+  // otherwise it would keep growing for as long as the checker is alive.
+  const MAX_CACHE_ENTRIES = 1000;
+  const cache = new Map();
+
+  const remember = (key, value) => {
+    if (cache.size >= MAX_CACHE_ENTRIES) {
+      // Maps iterate in insertion order, so the first key is the oldest one.
+      cache.delete(cache.keys().next().value);
+    }
+    cache.set(key, value);
+    return value;
+  };
+
+  return function containsIsolatedPhrase(query) {
+    const qNorm = normalizeTextForChatAllowlist(query);
+    const cached = cache.get(qNorm);
+    if (cached !== undefined) {
+      return cached;
+    }
+
+    const toks = qNorm.split(/[^\p{L}\p{N}_]+/u).filter(Boolean);
+    for (const [k, set] of byLen) {
+      // Slide a window of k tokens over the query and look each one up.
+      for (let i = 0; i + k <= toks.length; i++) {
+        if (set.has(toks.slice(i, i + k).join(" "))) {
+          return remember(qNorm, true);
+        }
+      }
+    }
+    return remember(qNorm, false);
+  };
+}
+
+/**
+ * Classifies a user query as "search" or "chat".
+ */
+export const IntentClassifier = {
+  /**
+   * Engine factory, kept as a property so tests can stub it.
+   */
+  _createEngine: createEngine,
+
+  /**
+   * Forced-chat checker, built once when the module loads.
+   * Kept as a property so tests can stub it.
+   */
+  _isForcedChat: makeIsolatedPhraseChecker(FORCED_CHAT_PHRASES),
+
+  /**
+   * Determines the intent of a query.
+   *
+   * A query that matches the forced-chat checker is "chat" without creating
+   * the engine. Otherwise the text-classification model is run and "chat" is
+   * returned only when the first result is labelled "chat" with a score of at
+   * least 0.8; every other outcome is "search".
+   *
+   * Rejects with a TypeError when query is not a string. Any error raised
+   * along the way is logged and re-thrown.
+   *
+   * @param {string} query
+   * @returns {Promise<string>} "search" | "chat"
+   */
+  async getPromptIntent(query) {
+    const chatThreshold = 0.8;
+    try {
+      const cleanedQuery = this._preprocessQuery(query);
+      if (this._isForcedChat(cleanedQuery)) {
+        return "chat";
+      }
+
+      const engine = await this._createEngine({
+        featureId: "smart-intent",
+        modelId: "mozilla/mobilebert-query-intent-detection",
+        modelRevision: "v0.2.0",
+        taskName: "text-classification",
+      });
+      // resp example: [{ label: "chat", score: 0.95 }, { label: "search", score: 0.04 }]
+      const resp = await engine.run({ args: [[cleanedQuery]] });
+      const top = resp[0];
+      const isConfidentChat =
+        top.label.toLowerCase() === "chat" && top.score >= chatThreshold;
+      return isConfidentChat ? "chat" : "search";
+    } catch (error) {
+      console.error("Error using intent detection model:", error);
+      throw error;
+    }
+  },
+
+  /**
+   * Prepares a query for classification by removing every question mark and
+   * trimming surrounding whitespace.
+   *
+   * @param {string} query
+   * @returns {string} The cleaned query.
+   * @throws {TypeError} If query is not a string.
+   */
+  _preprocessQuery(query) {
+    if (typeof query === "string") {
+      return query.replaceAll("?", "").trim();
+    }
+    throw new TypeError(
+      `Expected a string for query preprocessing, but received ${typeof query}`
+    );
+  },
+};
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -4,3 +4,9 @@
 
 with Files("**"):
     BUG_COMPONENT = ("Core", "Machine Learning: General")
+
+# Ship the intent classifier module. The xpcshell test imports it through a
+# moz-src:/// URL; presumably this list is what exposes it there (TODO confirm).
+MOZ_SRC_FILES += [
+    "IntentClassifier.sys.mjs",
+]
+
+# Register the xpcshell test manifest that lives under this directory.
+XPCSHELL_TESTS_MANIFESTS += ["tests/xpcshell/xpcshell.toml"]
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js b/browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js
@@ -0,0 +1,303 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+const {
+  IntentClassifier,
+  normalizeTextForChatAllowlist,
+  tokenizeTextForChatAllowlist,
+  buildChatAllowlist,
+  makeIsolatedPhraseChecker,
+} = ChromeUtils.importESModule(
+  "moz-src:///browser/components/aiwindow/models/IntentClassifier.sys.mjs"
+);
+
+const { sinon } = ChromeUtils.importESModule(
+  "resource://testing-common/Sinon.sys.mjs"
+);
+
+add_task(async function test_getPromptIntent_basic() {
+  // Stand-in for the ML engine: reports "search" with high confidence when the
+  // query mentions a search-like keyword (in any letter case), else "chat".
+  const searchKeywords = [
+    "search",
+    "find",
+    "look",
+    "query",
+    "locate",
+    "explore",
+  ];
+  const fakeEngine = {
+    run({ args: [[query]] }) {
+      const lowered = query.toLowerCase();
+      const [winner, runnerUp] = searchKeywords.some(k => lowered.includes(k))
+        ? ["search", "chat"]
+        : ["chat", "search"];
+      // Simulated confidence scores, best label first.
+      return [
+        { label: winner, score: 0.95 },
+        { label: runnerUp, score: 0.05 },
+      ];
+    },
+  };
+
+  // [prompt, expected intent]
+  const expectations = [
+    ["please search for news on firefox", "search"],
+    ["Can you FIND me the docs for PageAssist?", "search"], // case-insensitive
+    ["look up the best pizza in SF", "search"],
+    ["hello there, how are you?", "chat"],
+    ["tell me a joke", "chat"],
+  ];
+
+  const sb = sinon.createSandbox();
+  try {
+    sb.stub(IntentClassifier, "_createEngine").resolves(fakeEngine);
+    for (const [prompt, expected] of expectations) {
+      Assert.equal(
+        await IntentClassifier.getPromptIntent(prompt),
+        expected,
+        `getPromptIntent("${prompt}") should return "${expected}"`
+      );
+    }
+  } finally {
+    sb.restore();
+  }
+});
+
+add_task(async function test_preprocessQuery_removes_question_marks() {
+  // Exercises the real helper on the classifier: [input, expected output].
+  const expectations = [
+    ["hello?", "hello"],
+    ["?prompt", "prompt"],
+    ["multiple???", "multiple"],
+    ["mid?dle", "middle"],
+    ["question? ", "question"],
+    [" no?  spaces? ", "no  spaces"],
+    ["???", ""],
+    ["clean input", "clean input"],
+  ];
+
+  expectations.forEach(([input, expected]) => {
+    const result = IntentClassifier._preprocessQuery(input);
+    Assert.equal(
+      result,
+      expected,
+      `Expected "${input}" to preprocess to "${expected}", got "${result}"`
+    );
+  });
+});
+
+add_task(function test_normalizeTextForChatAllowlist_basic() {
+  const cases = [
+    {
+      // Lowercasing, trimming and collapsing of inner spaces
+      input: "  HeLLo   There  ",
+      expected: "hello there",
+      message: "Should lowercase, trim, and collapse spaces",
+    },
+    {
+      // NFKC folds compatibility forms, e.g. fullwidth 'ＴＥＳＴ' → 'test'
+      input: "ＴＥＳＴ  １２３",
+      expected: "test 123",
+      message: "Should NFKC-normalize fullwidth letters/digits",
+    },
+    {
+      // NBSP, tab and newline all count as whitespace
+      input: "a\u00A0b\tc\nd",
+      expected: "a b c d",
+      message: "Should collapse all whitespace kinds to single spaces",
+    },
+  ];
+
+  for (const { input, expected, message } of cases) {
+    Assert.equal(normalizeTextForChatAllowlist(input), expected, message);
+  }
+});
+
+add_task(function test_tokenizeTextForChatAllowlist_unicode_and_boundaries() {
+  const cases = [
+    {
+      // Punctuation separates tokens; letters, digits and underscores stay
+      text: "hello, world! 42_times",
+      tokens: ["hello", "world", "42_times"],
+      message: "Should split on punctuation and keep underscores",
+    },
+    {
+      // Non-ASCII letters are word characters (\p{L})
+      text: "mañana—café!",
+      tokens: ["mañana", "café"],
+      message:
+        "Should keep Unicode letters and split on punctuation (em dash, bang)",
+    },
+    {
+      // An apostrophe is a separator, by design
+      text: "what's up",
+      tokens: ["what", "s", "up"],
+      message: "Apostrophes are separators, so tokens split around them",
+    },
+  ];
+
+  for (const { text, tokens, message } of cases) {
+    Assert.deepEqual(tokenizeTextForChatAllowlist(text), tokens, message);
+  }
+});
+
+add_task(function test_buildChatAllowlist_grouping_and_normalization() {
+  const sets = buildChatAllowlist([
+    "sup",
+    "hi there", // 2 tokens
+    "what's up", // becomes "what s up" (3 tokens)
+    " foo   bar  ", // leading/trailing + multiple spaces
+    "", // empty should be skipped
+    "___", // token of underscores counts as 1 token
+  ]);
+
+  // One bucket per phrase length: 1, 2 and 3 tokens.
+  const expectedLengths = [
+    [1, "Should have set for single-token phrases"],
+    [2, "Should have set for two-token phrases"],
+    [3, "Should have set for three-token phrases"],
+  ];
+  for (const [length, message] of expectedLengths) {
+    Assert.ok(sets.has(length), message);
+  }
+
+  // Each bucket holds the normalized, space-joined phrases of that length
+  // (note the apostrophe split in "what s up").
+  const expectedEntries = [
+    [1, "sup", "Single-token set should contain 'sup'"],
+    [1, "___", "Single-token set should contain '___'"],
+    [2, "hi there", "Two-token set should contain 'hi there'"],
+    [2, "foo bar", "Two-token set should contain normalized 'foo bar'"],
+    [3, "what s up", "Three-token set should contain 'what s up'"],
+  ];
+  for (const [length, key, message] of expectedEntries) {
+    Assert.ok(sets.get(length).has(key), message);
+  }
+
+  // The empty phrase is skipped, so there is no bucket for length 0.
+  sets.forEach((set, k) => {
+    Assert.ok(
+      k > 0 && set.size >= 1,
+      "No empty keys, each set has at least one entry"
+    );
+  });
+});
+
+add_task(function test_isolated_phrase_checker_single_word_boundaries() {
+  const isForced = makeIsolatedPhraseChecker(["sup", "hello", "___"]);
+
+  // A phrase present as a whole token must be detected.
+  const matching = [
+    ["sup bro", "Should match 'sup' as an isolated token at start"],
+    ["hey, hello there", "Should match 'hello' surrounded by punctuation"],
+    ["foo ___ bar", "Should match token with underscores"],
+  ];
+  for (const [query, message] of matching) {
+    Assert.ok(isForced(query), message);
+  }
+
+  // A phrase embedded in a longer word must NOT be detected.
+  const nonMatching = [
+    ["supposingly, this should not match", "No partial-word match for 'sup'"],
+    ["supper time", "No partial-word match inside 'supper'"],
+    ["shelloworld", "No partial-word match for 'hello'"],
+  ];
+  for (const [query, message] of nonMatching) {
+    Assert.ok(!isForced(query), message);
+  }
+});
+
+add_task(function test_isolated_phrase_checker_multiword_and_punctuation() {
+  // Apostrophes split tokens, so the phrase "what's up" is stored as "what s up".
+  const isForced = makeIsolatedPhraseChecker(["hi there", "what's up"]);
+
+  // Punctuation between or around the words still leaves the same tokens.
+  const matching = [
+    ["hi—there!", "Em dash between words should match 'hi there'"],
+    ["well, hi there!!", "Punctuation around phrase should match"],
+    [
+      "so, what’s up today?",
+      "Curly apostrophe splits to tokens; should match 'what s up'",
+    ],
+  ];
+  for (const [query, message] of matching) {
+    Assert.ok(isForced(query), message);
+  }
+
+  // Words glued together form a single, different token.
+  const nonMatching = [
+    ["hithere", "Concatenated words should not match 'hi there'"],
+    ["whatssup", "Should not match 'what s up' without separators"],
+  ];
+  for (const [query, message] of nonMatching) {
+    Assert.ok(!isForced(query), message);
+  }
+});
+
+// Checks that whitespace, letter case and Unicode compatibility forms in the
+// query are normalized away before phrases are matched.
+add_task(function test_isolated_phrase_checker_spacing_and_unicode_norm() {
+  const phrases = ["good morning", "hello"];
+  const isForced = makeIsolatedPhraseChecker(phrases);
+
+  // Multiple spaces collapse
+  Assert.ok(
+    isForced("good     morning everyone"),
+    "Multiple spaces between tokens should still match"
+  );
+
+  // Letter case and surrounding whitespace
+  Assert.ok(
+    isForced("  HELLO  "),
+    "Case and surrounding spaces should normalize and match 'hello'"
+  );
+
+  // Fullwidth letters fold to their ASCII form under NFKC normalization
+  Assert.ok(
+    isForced("ＨＥＬＬＯ"),
+    "Fullwidth letters should NFKC-normalize and match 'hello'"
+  );
+
+  // Non-breaking spaces and tabs
+  Assert.ok(
+    isForced("good\u00A0morning\tteam"),
+    "NBSP and tabs normalize and match"
+  );
+});
+
+add_task(function test_isolated_phrase_checker_no_match_cases() {
+  const isForced = makeIsolatedPhraseChecker(["hi there", "sup"]);
+
+  // None of these queries contains a listed phrase as whole tokens.
+  const nonMatching = [
+    ["", "Empty string should not match"],
+    ["nothing to see here", "Unrelated text should not match"],
+    ["support", "Partial token with 'sup' prefix should not match"],
+  ];
+  for (const [query, message] of nonMatching) {
+    Assert.ok(!isForced(query), message);
+  }
+});
+
+add_task(function test_isolated_phrase_checker_caching_stability() {
+  const isForced = makeIsolatedPhraseChecker(["hello", "hi there"]);
+
+  // Asking twice about the same query must give the same answer; the second
+  // call is the one that can be served from the checker's cache.
+  const repeated = "Hello there!";
+  const [first, second] = [repeated, repeated].map(q => isForced(q));
+  Assert.equal(
+    first,
+    second,
+    "Same query should yield identical result across calls (cache-stable)"
+  );
+
+  // Queries differing only in whitespace must give the same answer too.
+  const padded = isForced("  hello   there ");
+  const compact = isForced("hello there");
+  Assert.equal(
+    padded,
+    compact,
+    "Whitespace variations should not affect result"
+  );
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -0,0 +1,5 @@
+[DEFAULT]
+run-if = ["os != 'android'"]
+firefox-appdir = "browser"
+
+["test_intent_classifier.js"]

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

A	browser/components/aiwindow/models/IntentClassifier.sys.mjs	\|	235	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	browser/components/aiwindow/models/moz.build	\|	6	++++++
A	browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js	\|	303	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml	\|	5	+++++