tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 22284e85a1de490c37bb63364be4a3cb03c6657a
parent a3565941a5eee04c452674570e64ab55274ab869
Author: Tzu-An Liu <tliu@mozilla.com>
Date:   Fri, 21 Nov 2025 20:39:03 +0000

Bug 2000945 - Move query intent detection to AI-window r=Mardak,ai-models-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D273259

Diffstat:
A browser/components/aiwindow/models/IntentClassifier.sys.mjs | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/aiwindow/models/moz.build | 6 ++++++
A browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml | 5 +++++
4 files changed, 549 insertions(+), 0 deletions(-)

diff --git a/browser/components/aiwindow/models/IntentClassifier.sys.mjs b/browser/components/aiwindow/models/IntentClassifier.sys.mjs
@@ -0,0 +1,292 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+import { createEngine } from "chrome://global/content/ml/EngineProcess.sys.mjs";
+
+// Queries containing any of these phrases as whole tokens are classified as
+// "chat" by IntentClassifier.getPromptIntent without running the model.
+const FORCED_CHAT_PHRASES = [
+  "amuse me",
+  "are we alone",
+  "are you alive",
+  "are you gpt",
+  "are you human",
+  "are you real",
+  "bark like dog",
+  "cheer me up",
+  "comfort me",
+  "count numbers",
+  "curse me",
+  "do aliens exist",
+  "do we matter",
+  "do you dream",
+  "do you think",
+  "does fate exist",
+  "dream meaning",
+  "drop wisdom",
+  "encourage me",
+  "entertain me",
+  "explain yourself",
+  "flip coin",
+  "give blessing",
+  "give wisdom",
+  "good morning",
+  "good night",
+  "guess number",
+  "hallo",
+  "hello",
+  "hey",
+  "hi",
+  "hola",
+  "how are you",
+  "inspire me",
+  "invent a word",
+  "invent holiday",
+  "invent joke",
+  "is god real",
+  "life advice",
+  "life purpose",
+  "list animals",
+  "list capitals",
+  "list colors",
+  "list countries",
+  "list elements",
+  "list fruits",
+  "list metals",
+  "list oceans",
+  "list planets",
+  "list shapes",
+  "meaning of life",
+  "meow like cat",
+  "motivate me",
+  "now you are",
+  "play a game",
+  "pretend alien",
+  "pretend child",
+  "pretend detective",
+  "pretend ghost",
+  "pretend pirate",
+  "pretend robot",
+  "pretend superhero",
+  "pretend teacher",
+  "pretend wizard",
+  "random fact",
+  "random number",
+  "roll dice",
+  "goodbye",
+  "simulate chat",
+  "simulate future",
+  "simulate past",
+  "sing like robot",
+  "sing lullaby",
+  "sing rap",
+  "sup",
+  "surprise me",
+  "teach me",
+  "tell bedtime story",
+  "tell fortune",
+  "tell joke",
+  "tell prophecy",
+  "tell riddle",
+  "tell story",
+  "what is art",
+  "what is beauty",
+  "what is death",
+  "what is freedom",
+  "what is justice",
+  "what is love",
+  "what is mind",
+  "what is reality",
+  "what is right",
+  "what is self",
+  "what is soul",
+  "what is time",
+  "what is truth",
+  "what is wrong",
+  "what model are you",
+  "what version",
+  "what’s up",
+  "which model are you",
+  "who am i",
+  "who are you",
+  "who made you",
+  "why are we",
+  "write a poem",
+  "write a song",
+  "write haiku",
+  "write quote",
+  "your model is",
+];
+
+// Token separator: any run of characters that are not letters, numbers or "_".
+const TOKEN_SEPARATOR_RE = /[^\p{L}\p{N}_]+/u;
+
+// Default cap on the number of memoized queries kept by a phrase checker.
+const DEFAULT_PHRASE_CACHE_ENTRIES = 1000;
+
+/**
+ * Canonicalizes text for allowlist matching: NFKC-normalizes, lowercases,
+ * collapses every whitespace run to one space and trims both ends.
+ *
+ * @param {string} s
+ * @returns {string}
+ */
+export function normalizeTextForChatAllowlist(s) {
+  // Normalize before lowercasing so capitals that only appear after
+  // compatibility decomposition (e.g. U+2121 "℡" -> "TEL") get lowercased.
+  return s.normalize("NFKC").toLowerCase().replace(/\s+/g, " ").trim();
+}
+
+/**
+ * Splits text into normalized tokens. Letters, numbers and "_" are "word"
+ * characters; every other character is a separator.
+ *
+ * @param {string} s
+ * @returns {string[]}
+ */
+export function tokenizeTextForChatAllowlist(s) {
+  return normalizeTextForChatAllowlist(s)
+    .split(TOKEN_SEPARATOR_RE)
+    .filter(Boolean);
+}
+
+/**
+ * Groups phrases by token count. Each phrase is stored as its tokens joined
+ * with single spaces; phrases that yield no tokens are skipped.
+ *
+ * @param {Iterable<string>} phrases
+ * @returns {Map<number, Set<string>>} token count -> Set("tok tok ...")
+ */
+export function buildChatAllowlist(phrases) {
+  const byLen = new Map();
+  for (const p of phrases) {
+    const toks = tokenizeTextForChatAllowlist(p);
+    if (!toks.length) {
+      continue;
+    }
+    if (!byLen.has(toks.length)) {
+      byLen.set(toks.length, new Set());
+    }
+    byLen.get(toks.length).add(toks.join(" "));
+  }
+  return byLen;
+}
+
+/**
+ * Builds a checker that reports whether a query contains any of the phrases
+ * as a run of whole tokens (so "sup" matches "sup bro" but not "support").
+ * Results are memoized per normalized query in a bounded cache.
+ *
+ * @param {Iterable<string>} phrases
+ * @param {number} [maxCacheEntries] Oldest entries are evicted beyond this.
+ * @returns {function(string): boolean}
+ */
+export function makeIsolatedPhraseChecker(
+  phrases,
+  maxCacheEntries = DEFAULT_PHRASE_CACHE_ENTRIES
+) {
+  const byLen = buildChatAllowlist(phrases);
+  const cache = new Map();
+
+  // Stores a result, first evicting the oldest entry when the cache is full so
+  // that arbitrary user queries cannot grow it without bound.
+  function remember(key, value) {
+    if (cache.size >= maxCacheEntries) {
+      // Map iterates in insertion order, so the first key is the oldest.
+      cache.delete(cache.keys().next().value);
+    }
+    cache.set(key, value);
+    return value;
+  }
+
+  return function containsIsolatedPhrase(query) {
+    const qNorm = normalizeTextForChatAllowlist(query);
+    if (cache.has(qNorm)) {
+      return cache.get(qNorm);
+    }
+
+    const toks = qNorm.split(TOKEN_SEPARATOR_RE).filter(Boolean);
+    for (const [k, set] of byLen) {
+      // Slide a k-token window across the query tokens.
+      for (let i = 0; i + k <= toks.length; i++) {
+        if (set.has(toks.slice(i, i + k).join(" "))) {
+          return remember(qNorm, true);
+        }
+      }
+    }
+    return remember(qNorm, false);
+  };
+}
+
+/**
+ * Intent Classifier Engine
+ */
+export const IntentClassifier = {
+  /**
+   * Exposing createEngine for testing purposes.
+   */
+  _createEngine: createEngine,
+
+  /**
+   * Initialize forced-chat checker at module load.
+   * Keeping it as a property ensures easy stubbing in tests.
+   */
+  _isForcedChat: makeIsolatedPhraseChecker(FORCED_CHAT_PHRASES),
+
+  /**
+   * Gets the intent of the query. A query containing a forced-chat phrase is
+   * "chat" without running the model; otherwise the first entry of the text
+   * classification model's response decides, and only a "chat" label with a
+   * score of at least 0.8 yields "chat".
+   *
+   * @param {string} query
+   * @returns {Promise<string>} "search" | "chat"; rejects (after logging) with
+   *   any error raised while preprocessing or classifying.
+   */
+  async getPromptIntent(query) {
+    try {
+      const cleanedQuery = this._preprocessQuery(query);
+      if (this._isForcedChat(cleanedQuery)) {
+        return "chat";
+      }
+      const engine = await this._createEngine({
+        featureId: "smart-intent",
+        modelId: "mozilla/mobilebert-query-intent-detection",
+        modelRevision: "v0.2.0",
+        taskName: "text-classification",
+      });
+      const threshold = 0.8;
+      const resp = await engine.run({ args: [[cleanedQuery]] });
+      // resp example: [{ label: "chat", score: 0.95 }, { label: "search", score: 0.04 }]
+      if (
+        resp[0].label.toLowerCase() === "chat" &&
+        resp[0].score >= threshold
+      ) {
+        return "chat";
+      }
+      return "search";
+    } catch (error) {
+      console.error("Error using intent detection model:", error);
+      throw error;
+    }
+  },
+
+  /**
+   * Helper for preprocessing text input: removes every "?" and trims.
+   *
+   * @param {string} query
+   * @returns {string}
+   * @throws {TypeError} If query is not a string.
+   */
+  _preprocessQuery(query) {
+    if (typeof query !== "string") {
+      throw new TypeError(
+        `Expected a string for query preprocessing, but received ${typeof query}`
+      );
+    }
+    return query.replaceAll("?", "").trim();
+  },
+};
diff --git a/browser/components/aiwindow/models/moz.build b/browser/components/aiwindow/models/moz.build
@@ -4,3 +4,9 @@
 
 with Files("**"):
     BUG_COMPONENT = ("Core", "Machine Learning: General")
+
+MOZ_SRC_FILES += [
+    "IntentClassifier.sys.mjs",
+]
+
+XPCSHELL_TESTS_MANIFESTS += ["tests/xpcshell/xpcshell.toml"]
diff --git a/browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js b/browser/components/aiwindow/models/tests/xpcshell/test_intent_classifier.js
@@ -0,0 +1,303 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+const {
+  IntentClassifier,
+  normalizeTextForChatAllowlist,
+  tokenizeTextForChatAllowlist,
+  buildChatAllowlist,
+  makeIsolatedPhraseChecker,
+} = ChromeUtils.importESModule(
+  "moz-src:///browser/components/aiwindow/models/IntentClassifier.sys.mjs"
+);
+
+const { sinon } = ChromeUtils.importESModule(
+  "resource://testing-common/Sinon.sys.mjs"
+);
+
+add_task(async function test_getPromptIntent_basic() {
+  const sb = sinon.createSandbox();
+  try {
+    const cases = [
+      { prompt: "please search for news on firefox", expected: "search" },
+      {
+        prompt: "Can you FIND me the docs for PageAssist?",
+        expected: "search",
+      }, // case-insensitive
+      { prompt: "look up the best pizza in SF", expected: "search" },
+      { prompt: "hello there, how are you?", expected: "chat" },
+      { prompt: "tell me a joke", expected: "chat" },
+    ];
+
+    const fakeEngine = {
+      run({ args: [[query]] }) {
+        const searchKeywords = [
+          "search",
+          "find",
+          "look",
+          "query",
+          "locate",
+          "explore",
+        ];
+        const formattedPrompt = query.toLowerCase();
+        const isSearch = searchKeywords.some(keyword =>
+          formattedPrompt.includes(keyword)
+        );
+
+        // Simulate model confidence scores
+        if (isSearch) {
+          return [
+            { label: "search", score: 0.95 },
+            { label: "chat", score: 0.05 },
+          ];
+        }
+        return [
+          { label: "chat", score: 0.95 },
+          { label: "search", score: 0.05 },
+        ];
+      },
+    };
+
+    sb.stub(IntentClassifier, "_createEngine").resolves(fakeEngine);
+
+    for (const { prompt, expected } of cases) {
+      const intent = await IntentClassifier.getPromptIntent(prompt);
+      Assert.equal(
+        intent,
+        expected,
+        `getPromptIntent("${prompt}") should return "${expected}"`
+      );
+    }
+  } finally {
+    sb.restore();
+  }
+});
+
+add_task(async function test_preprocessQuery_removes_question_marks() {
+  // Call the real helper on the classifier
+  const cases = [
+    { input: "hello?", expected: "hello" },
+    { input: "?prompt", expected: "prompt" },
+    { input: "multiple???", expected: "multiple" },
+    { input: "mid?dle", expected: "middle" },
+    { input: "question? ", expected: "question" },
+    { input: " no? spaces? ", expected: "no spaces" },
+    { input: "???", expected: "" },
+    { input: "clean input", expected: "clean input" },
+  ];
+
+  for (const { input, expected } of cases) {
+    const result = IntentClassifier._preprocessQuery(input);
+    Assert.equal(
+      result,
+      expected,
+      `Expected "${input}" to preprocess to "${expected}", got "${result}"`
+    );
+  }
+});
+
+add_task(function test_normalizeTextForChatAllowlist_basic() {
+  // lowercasing + trimming + collapsing runs of internal spaces
+  Assert.equal(
+    normalizeTextForChatAllowlist("  HeLLo   There  "),
+    "hello there",
+    "Should lowercase, trim, and collapse spaces"
+  );
+
+  // NFKC normalization: compatibility forms → canonical
+  // Fullwidth characters normalize: e.g., 'ＴＥＳＴ' → 'test' (U+3000 → space)
+  Assert.equal(
+    normalizeTextForChatAllowlist("ＴＥＳＴ　１２３"),
+    "test 123",
+    "Should NFKC-normalize fullwidth letters/digits"
+  );
+
+  // Multiple whitespace kinds (NBSP, tabs, newlines) collapse
+  Assert.equal(
+    normalizeTextForChatAllowlist("a\u00A0b\tc\nd"),
+    "a b c d",
+    "Should collapse all whitespace kinds to single spaces"
+  );
+});
+
+add_task(function test_tokenizeTextForChatAllowlist_unicode_and_boundaries() {
+  // Splits on non-word chars, keeps letters/digits/underscore
+  Assert.deepEqual(
+    tokenizeTextForChatAllowlist("hello, world! 42_times"),
+    ["hello", "world", "42_times"],
+    "Should split on punctuation and keep underscores"
+  );
+
+  // Unicode letters should be treated as word chars (\p{L})
+  Assert.deepEqual(
+    tokenizeTextForChatAllowlist("mañana—café!"),
+    ["mañana", "café"],
+    "Should keep Unicode letters and split on punctuation (em dash, bang)"
+  );
+
+  // Apostrophes split (non-word), as intended
+  Assert.deepEqual(
+    tokenizeTextForChatAllowlist("what's up"),
+    ["what", "s", "up"],
+    "Apostrophes are separators, so tokens split around them"
+  );
+});
+
+add_task(function test_buildChatAllowlist_grouping_and_normalization() {
+  const phrases = [
+    "sup",
+    "hi there", // 2 tokens
+    "what's up", // becomes "what s up" (3 tokens)
+    "  foo   bar  ", // leading/trailing + multiple spaces
+    "", // empty should be skipped
+    "___", // token of underscores counts as 1 token
+  ];
+  const sets = buildChatAllowlist(phrases);
+
+  // Expect keys for lengths: 1, 2, 3
+  Assert.ok(sets.has(1), "Should have set for single-token phrases");
+  Assert.ok(sets.has(2), "Should have set for two-token phrases");
+  Assert.ok(sets.has(3), "Should have set for three-token phrases");
+
+  // 1-token set contains: "sup", "___"
+  Assert.ok(sets.get(1).has("sup"), "Single-token set should contain 'sup'");
+  Assert.ok(sets.get(1).has("___"), "Single-token set should contain '___'");
+
+  // 2-token set contains normalized "hi there" and "foo bar"
+  Assert.ok(
+    sets.get(2).has("hi there"),
+    "Two-token set should contain 'hi there'"
+  );
+  Assert.ok(
+    sets.get(2).has("foo bar"),
+    "Two-token set should contain normalized 'foo bar'"
+  );
+
+  // 3-token set contains "what s up" (note apostrophe split)
+  Assert.ok(
+    sets.get(3).has("what s up"),
+    "Three-token set should contain 'what s up'"
+  );
+
+  // Empty phrase skipped: nothing added for length 0
+  for (const [k, set] of sets) {
+    Assert.ok(
+      k > 0 && set.size >= 1,
+      "No empty keys, each set has at least one entry"
+    );
+  }
+});
+
+add_task(function test_isolated_phrase_checker_single_word_boundaries() {
+  const phrases = ["sup", "hello", "___"];
+  const isForced = makeIsolatedPhraseChecker(phrases);
+
+  // Positive: exact token present
+  Assert.ok(
+    isForced("sup bro"),
+    "Should match 'sup' as an isolated token at start"
+  );
+  Assert.ok(
+    isForced("hey, hello there"),
+    "Should match 'hello' surrounded by punctuation"
+  );
+  Assert.ok(isForced("foo ___ bar"), "Should match token with underscores");
+
+  // Negative: partial-word should NOT match
+  Assert.ok(
+    !isForced("supposingly, this should not match"),
+    "No partial-word match for 'sup'"
+  );
+  Assert.ok(!isForced("supper time"), "No partial-word match inside 'supper'");
+  Assert.ok(!isForced("shelloworld"), "No partial-word match for 'hello'");
+});
+
+add_task(function test_isolated_phrase_checker_multiword_and_punctuation() {
+  // Multiword phrases; apostrophes become token splits -> "what's up" => "what s up"
+  const phrases = ["hi there", "what's up"];
+  const isForced = makeIsolatedPhraseChecker(phrases);
+
+  // Positive: punctuation between words should still match (token split)
+  Assert.ok(
+    isForced("hi—there!"),
+    "Em dash between words should match 'hi there'"
+  );
+  Assert.ok(
+    isForced("well, hi there!!"),
+    "Punctuation around phrase should match"
+  );
+  Assert.ok(
+    isForced("so, what’s up today?"),
+    "Curly apostrophe splits to tokens; should match 'what s up'"
+  );
+
+  // Negative: glued words should not match
+  Assert.ok(
+    !isForced("hithere"),
+    "Concatenated words should not match 'hi there'"
+  );
+  Assert.ok(
+    !isForced("whatssup"),
+    "Should not match 'what s up' without separators"
+  );
+});
+
+add_task(function test_isolated_phrase_checker_spacing_and_unicode_norm() {
+  const phrases = ["good morning", "hello"];
+  const isForced = makeIsolatedPhraseChecker(phrases);
+
+  // Multiple spaces collapse
+  Assert.ok(
+    isForced("good   morning everyone"),
+    "Multiple spaces between tokens should still match"
+  );
+
+  // Fullwidth / NFKC normalization (ＨＥＬＬＯ) and basic usage
+  Assert.ok(
+    isForced("  ＨＥＬＬＯ  "),
+    "Case and surrounding spaces should normalize and match 'hello'"
+  );
+
+  // Non-breaking spaces and tabs
+  Assert.ok(
+    isForced("good\u00A0morning\tteam"),
+    "NBSP and tabs normalize and match"
+  );
+});
+
+add_task(function test_isolated_phrase_checker_no_match_cases() {
+  const phrases = ["hi there", "sup"];
+  const isForced = makeIsolatedPhraseChecker(phrases);
+
+  Assert.ok(!isForced(""), "Empty string should not match");
+  Assert.ok(
+    !isForced("nothing to see here"),
+    "Unrelated text should not match"
+  );
+  Assert.ok(
+    !isForced("support"),
+    "Partial token with 'sup' prefix should not match"
+  );
+});
+
+add_task(function test_isolated_phrase_checker_caching_stability() {
+  const phrases = ["hello", "hi there"];
+  const isForced = makeIsolatedPhraseChecker(phrases);
+
+  // Repeated calls with the same input should return identical results (cache sanity)
+  const q1 = "Hello there!";
+  const first = isForced(q1);
+  const second = isForced(q1);
+  Assert.equal(
+    first,
+    second,
+    "Same query should yield identical result across calls (cache-stable)"
+  );
+
+  // Padded / multi-space input should normalize to the same outcome
+  Assert.equal(
+    isForced("  hello   there  "),
+    isForced("hello there"),
+    "Whitespace variations should not affect result"
+  );
+});
diff --git a/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml b/browser/components/aiwindow/models/tests/xpcshell/xpcshell.toml
@@ -0,0 +1,5 @@
+[DEFAULT]
+run-if = ["os != 'android'"]
+firefox-appdir = "browser"
+
+["test_intent_classifier.js"]