tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 4c51134ff3ccd71bd04ccf170fcd99434657de9d
parent 2c0430bd746ec5fca77f77564b6fb23b52355c31
Author: Vasish Baungally <vbaungally@mozilla.com>
Date:   Thu,  4 Dec 2025 21:20:03 +0000

Bug 2003577 - Add Domain Handling for Smart Tab Grouping. r=tarek,tabbrowser-reviewers,ai-ondevice-reviewers,sthompson

The Logistic Regression approach achieves a 20% improvement in macro F1 over the existing Nearest Neighbor implementation on our evaluation dataset. We'll start an experiment comparing the two approaches soon after this patch lands.

Differential Revision: https://phabricator.services.mozilla.com/D274984

Diffstat:
Mbrowser/components/tabbrowser/SmartTabGrouping.sys.mjs | 213+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mbrowser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js | 27++++++++++++++++++++++++---
Abrowser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js | 225+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mbrowser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml | 2++
Mtoolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js | 211++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
5 files changed, 626 insertions(+), 52 deletions(-)

diff --git a/browser/components/tabbrowser/SmartTabGrouping.sys.mjs b/browser/components/tabbrowser/SmartTabGrouping.sys.mjs @@ -84,7 +84,7 @@ const MAX_NON_SUMMARIZED_SEARCH_LENGTH = 26; export const DIM_REDUCTION_METHODS = {}; const MISSING_ANCHOR_IN_CLUSTER_PENALTY = 0.2; -const MAX_NN_GROUPED_TABS = 4; +const MAX_NN_GROUPED_TABS = 3; const MAX_SUGGESTED_TABS = 10; const DISSIMILAR_TAB_LABEL = "none"; @@ -135,19 +135,25 @@ export const SMART_TAB_GROUPING_CONFIG = { // these parameters were generated by training a logistic regression // model on synthetic data. see https://github.com/mozilla/smart-tab-grouping -// for more info +// and https://github.com/mozilla/smart-tab-grouping/pull/12 for more info const LOGISTIC_REGRESSION_PARAMS = { + // Logistic WITH group name + // Features: s_gc, s_tt_max, s_dd in [0, 1] TITLE_WITH_GROUP_NAME: { - GROUP_SIMILARITY_WEIGHT: 6.76420017, - TITLE_SIMILARITY_WEIGHT: 2.95779555, - INTERCEPT: -3.06862155, - THRESHOLD: 0.45, + GROUP_SIMILARITY_WEIGHT: 0.10249, + TITLE_SIMILARITY_WEIGHT: 0.54897, + DOMAIN_SIMILARITY_WEIGHT: 0.34854, + INTERCEPT: -0.07397, + THRESHOLD: 0.59, }, + // Logistic WITHOUT group name + // Features: s_tt_max, s_dd in [0, 1] TITLE_ONLY: { - GROUP_SIMILARITY_WEIGHT: 0, - TITLE_SIMILARITY_WEIGHT: 2.50596721, - INTERCEPT: -0.54293376, - THRESHOLD: 0.6, + GROUP_SIMILARITY_WEIGHT: 0, // unused in this variant + TITLE_SIMILARITY_WEIGHT: 0.92513, + DOMAIN_SIMILARITY_WEIGHT: 0.07487, + INTERCEPT: -2.58574, + THRESHOLD: 0.123, }, }; @@ -442,8 +448,8 @@ export class SmartTabGroupingManager { /** * Calculates the average similarity between the anchor embeddings and the candidate embeddings * - * @param {list[Number]} anchorEmbeddings title embeddings for the anchor tabs - * @param {list[Number]} candidateEmbeddings title embeddings for the candidate tabs + * @param {number[]} anchorEmbeddings title embeddings for the anchor tabs + * @param {number[]} candidateEmbeddings title embeddings for the candidate tabs */ 
getAverageSimilarity(anchorEmbeddings, candidateEmbeddings) { let averageSimilarities = []; @@ -458,6 +464,96 @@ export class SmartTabGroupingManager { } /** + * Calculates the max similarity between the anchor embeddings and the candidate embeddings + * (used for s_tt_max). + * + * @param {number[]} anchorEmbeddings title embeddings for the anchor tabs + * @param {number[]} candidateEmbeddings title embeddings for the candidate tabs + */ + getMaxSimilarity(anchorEmbeddings, candidateEmbeddings) { + let maxSimilarities = []; + for (let candidate_embedding of candidateEmbeddings) { + let maxSimilarity = -1; + for (let anchor_embedding of anchorEmbeddings) { + const sim = cosSim(candidate_embedding, anchor_embedding); + if (sim > maxSimilarity) { + maxSimilarity = sim; + } + } + maxSimilarities.push(maxSimilarity); + } + return maxSimilarities; + } + + /** + * Extract base domain from a URL with error handling + * + * @param {string} url + * @return {string} + */ + static getBaseDomain(url) { + if (!url) { + return ""; + } + + let hostname; + try { + ({ hostname } = new URL(url)); + } catch (_e) { + // invalid URL + return ""; + } + + if (!hostname) { + return ""; + } + + try { + // additionalParts = 1 → one label above the registrable domain + // then remove 'www' + // https://www.example.com -> www.example.com -> example.com + // https://www.docs.google.com -> docs.google.com + // https://localhost -> error + return Services.eTLD + .getBaseDomain(Services.io.newURI(url.toLowerCase()), 1) + .replace(/^www\./, ""); + } catch (_e) { + // localhost, IPs, internal hosts, etc. + // bucket by the hostname. + return hostname.toLowerCase(); + } + } + + /** + * For each candidate tab, compute s_dd = fraction of anchors whose base domain + * matches the candidate's base domain. 
+ * + * @param {Array} anchorTabsPrep output of _prepareTabData for anchor tabs + * @param {Array} candidateTabsPrep output of _prepareTabData for candidate tabs + * @return {number[]} array of s_dd values in [0, 1] + */ + getDomainMatchFractions(anchorTabsPrep, candidateTabsPrep) { + const anchorDomains = anchorTabsPrep.map(t => + SmartTabGroupingManager.getBaseDomain(t.url) + ); + const numAnchors = anchorDomains.length || 1; + + return candidateTabsPrep.map(tab => { + const candDomain = SmartTabGroupingManager.getBaseDomain(tab.url); + if (!candDomain) { + return 0; + } + let same = 0; + for (const ad of anchorDomains) { + if (ad && ad === candDomain) { + same++; + } + } + return same / numAnchors; + }); + } + + /** * Calculates the sigmoid value of the input * * @param {number} z @@ -470,38 +566,62 @@ export class SmartTabGroupingManager { /** * Calculates the probability using the linear combination of the parameters * - * @param {number} groupSimilarity how similar a candidate tab is to the group name - * @param {number} titleSimilarity how similar a candidate tab is to the anchors + * @param {number} groupSimilarity s_gc in [0,1] + * @param {number} titleSimilarity s_tt_max in [0,1] + * @param {number} domainSimilarity s_dd in [0,1] * @param {object} params the logistic regression weights assigned to each parameter * @return {number} */ - calculateProbability(groupSimilarity, titleSimilarity, params) { - return this.sigmoid( - groupSimilarity * params.GROUP_SIMILARITY_WEIGHT + - titleSimilarity * params.TITLE_SIMILARITY_WEIGHT + - params.INTERCEPT - ); + calculateProbability( + groupSimilarity, + titleSimilarity, + domainSimilarity, + params + ) { + const wGroup = params.GROUP_SIMILARITY_WEIGHT || 0; + const wTitle = params.TITLE_SIMILARITY_WEIGHT || 0; + const wDomain = params.DOMAIN_SIMILARITY_WEIGHT || 0; + const z = + groupSimilarity * wGroup + + titleSimilarity * wTitle + + domainSimilarity * wDomain + + params.INTERCEPT; + return this.sigmoid(z); } /** 
- * Calculates the probabilities given two lists of the same length + * Calculates the probabilities given similarity lists (cosine) and domain fractions. * - * @param {list[Number]} groupSimilarities cosine similarity between the candidate tabs and the group name - * @param {list[Number]} titleSimilarities average cosine similarity between the candidate tabs and anchors - * @return {list[Number]} probabilities for each candidate tab + * @param {number[]|null} groupSimilaritiesCos cosine(group, candidate) in [-1,1] or null + * @param {number[]} titleSimilaritiesCos max cosine(anchor, candidate) in [-1,1] + * @param {number[]} domainSimilarities s_dd in [0,1] + * @return {number[]} probabilities for each candidate tab */ - calculateAllProbabilities(groupSimilarities, titleSimilarities) { - const hasGroupSimilarity = Boolean(groupSimilarities); - let probabilities = []; - for (let i = 0; i < titleSimilarities.length; i++) { + calculateAllProbabilities( + groupSimilaritiesCos, + titleSimilaritiesCos, + domainSimilarities + ) { + const hasGroupSimilarity = + Array.isArray(groupSimilaritiesCos) && groupSimilaritiesCos.length; + const useDomain = + Array.isArray(domainSimilarities) && domainSimilarities.length; + + const probabilities = []; + for (let i = 0; i < titleSimilaritiesCos.length; i++) { + // groupTitleSim and titleSim are (cos + 1)/2 -> [0,1] + const groupTitleSim = hasGroupSimilarity + ? 0.5 * (groupSimilaritiesCos[i] + 1) + : 0; + const titleSim = 0.5 * (titleSimilaritiesCos[i] + 1); + const domainSim = useDomain ? domainSimilarities[i] : 0; + + const params = hasGroupSimilarity + ? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME + : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY; + probabilities.push( - this.calculateProbability( - hasGroupSimilarity ? groupSimilarities[i] : 0, - titleSimilarities[i], - hasGroupSimilarity - ? 
LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME - : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY - ) + this.calculateProbability(groupTitleSim, titleSim, domainSim, params) ); } return probabilities; @@ -543,28 +663,35 @@ export class SmartTabGroupingManager { ); let groupEmbedding; - let groupSimilarities; + let groupSimilaritiesCos = null; if (groupLabel) { groupEmbedding = await this._generateEmbeddings([groupLabel]); - // calculate similarity between the group and the candidate tabs if group name is present - groupSimilarities = this.getAverageSimilarity( + // cosine(group, candidate_title) in [-1,1] + groupSimilaritiesCos = this.getAverageSimilarity( groupEmbedding, titleEmbeddings.slice(anchorTabsPrep.length) ); } - // calculate the similarity between the anchors and candidate titles - const titleSimilarities = this.getAverageSimilarity( + // s_tt_max: max cosine(anchor_title, candidate_title) in [-1,1] + const titleSimilaritiesCos = this.getMaxSimilarity( titleEmbeddings.slice(0, anchorTabsPrep.length), titleEmbeddings.slice(anchorTabsPrep.length) ); + // s_dd: fraction of anchors sharing the candidate's base domain + const domainSimilarities = this.getDomainMatchFractions( + anchorTabsPrep, + candidateTabsPrep + ); + const candidateProbabilities = this.calculateAllProbabilities( - groupSimilarities, - titleSimilarities + groupSimilaritiesCos, + titleSimilaritiesCos, + domainSimilarities ); - // get proper params depending on group name availability + // get matching params depending on the group name availability const probabilityThreshold = groupEmbedding ? 
LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME.THRESHOLD : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY.THRESHOLD; diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js @@ -10,13 +10,16 @@ add_task(function test_calculate_probability_zero_inputs() { const params = { GROUP_SIMILARITY_WEIGHT: 1, TITLE_SIMILARITY_WEIGHT: 1, + DOMAIN_SIMILARITY_WEIGHT: 1, INTERCEPT: 0, }; const groupSim = 0; const titleSim = 0; + const domainSim = 0; const result = smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5 @@ -32,16 +35,19 @@ add_task(function test_calculate_probability_both_positive() { const params = { GROUP_SIMILARITY_WEIGHT: 1, TITLE_SIMILARITY_WEIGHT: 1, + DOMAIN_SIMILARITY_WEIGHT: 1, INTERCEPT: 0, }; const groupSim = 1; const titleSim = 1; + const domainSim = 1; const result = smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); - const expected = 1 / (1 + Math.exp(-2)); + const expected = 1 / (1 + Math.exp(-3)); Assert.equal( result.toPrecision(4), expected.toPrecision(4), @@ -54,13 +60,16 @@ add_task(function test_calculate_probability_mixed_values() { const params = { GROUP_SIMILARITY_WEIGHT: 2, TITLE_SIMILARITY_WEIGHT: 3, + DOMAIN_SIMILARITY_WEIGHT: 0, INTERCEPT: 0.5, }; const groupSim = 1; const titleSim = -1; + const domainSim = -1; const result = smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); const expected = 1 / (1 + Math.exp(0.5)); // sigmoid(-0.5) @@ -76,13 +85,16 @@ add_task(function test_calculate_probability_zero_weights() { const params = { GROUP_SIMILARITY_WEIGHT: 0, TITLE_SIMILARITY_WEIGHT: 0, + DOMAIN_SIMILARITY_WEIGHT: 0, INTERCEPT: 0, }; const groupSim = 5; const titleSim = -3; + const domainSim = 1; const result = 
smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5 @@ -98,16 +110,19 @@ add_task(function test_calculate_probability_extreme_positive() { const params = { GROUP_SIMILARITY_WEIGHT: 1, TITLE_SIMILARITY_WEIGHT: 1, + DOMAIN_SIMILARITY_WEIGHT: 1, INTERCEPT: 0, }; const groupSim = 10; const titleSim = 10; + const domainSim = 10; const result = smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); - const expected = 1 / (1 + Math.exp(-20)); + const expected = 1 / (1 + Math.exp(-30)); Assert.equal( result.toPrecision(4), expected.toPrecision(4), @@ -120,16 +135,19 @@ add_task(function test_calculate_probability_extreme_negative() { const params = { GROUP_SIMILARITY_WEIGHT: 1, TITLE_SIMILARITY_WEIGHT: 1, + DOMAIN_SIMILARITY_WEIGHT: 1, INTERCEPT: 0, }; const groupSim = -10; const titleSim = -10; + const domainSim = -10; const result = smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); - const expected = 1 / (1 + Math.exp(20)); + const expected = 1 / (1 + Math.exp(30)); Assert.equal( result.toPrecision(4), expected.toPrecision(4), @@ -142,13 +160,16 @@ add_task(function test_calculate_probability_negative_intercept() { const params = { GROUP_SIMILARITY_WEIGHT: 1, TITLE_SIMILARITY_WEIGHT: 1, + DOMAIN_SIMILARITY_WEIGHT: 0, INTERCEPT: -1, }; const groupSim = 0.5; const titleSim = 0.5; + const domainSim = 0.5; const result = smartTabGroupingManager.calculateProbability( groupSim, titleSim, + domainSim, params ); const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5 diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js @@ -0,0 +1,225 @@ +/* Any copyright is dedicated to the Public Domain. 
+ * http://creativecommons.org/publicdomain/zero/1.0/ */ + +const { SmartTabGroupingManager } = ChromeUtils.importESModule( + "moz-src:///browser/components/tabbrowser/SmartTabGrouping.sys.mjs" +); + +add_task(function test_logistic_regression_get_base_domain() { + // Basic HTTPS URL with www + Assert.equal( + SmartTabGroupingManager.getBaseDomain("https://www.example.com/path"), + "example.com", + "www.example.com should normalize to example.com" + ); + + // Multiple subdomains + Assert.equal( + SmartTabGroupingManager.getBaseDomain("https://docs.example.com"), + "docs.example.com", + "Should keep last subdomain + baseDomain" + ); + + // Hosted services like blogs + Assert.equal( + SmartTabGroupingManager.getBaseDomain("https://myblog.example.com/"), + "myblog.example.com", + "Should bucket per hosted subdomain (blog, docs, etc.)" + ); + + // Host without dots + Assert.equal( + SmartTabGroupingManager.getBaseDomain("http://localhost"), + "localhost", + "Should return hostname as-is when there is no dot" + ); + + // Invalid / empty URL should be handled gracefully + Assert.equal( + SmartTabGroupingManager.getBaseDomain(""), + "", + "Invalid URL should return empty string" + ); +}); + +add_task(function test_logistic_regression_domain_match_fractions() { + const mgr = new SmartTabGroupingManager(); + + const anchors = [ + { url: "https://a.com/foo" }, + { url: "https://www.a.com/bar" }, + { url: "https://b.com/baz" }, + ]; + const candidates = [ + { url: "https://a.com/other" }, // matches 2 of 3 anchors + { url: "https://b.com/other" }, // matches 1 of 3 anchors + { url: "https://c.com/other" }, // matches 0 of 3 anchors + { url: "" }, // invalid / empty URL + ]; + + const fractions = mgr.getDomainMatchFractions(anchors, candidates); + + Assert.equal( + fractions.length, + candidates.length, + "Should return one value per candidate" + ); + + Assert.less( + Math.abs(fractions[0] - 2 / 3), + 1e-6, + "Candidate with domain matching two of three anchors should have 
fraction 2/3" + ); + + Assert.less( + Math.abs(fractions[1] - 1 / 3), + 1e-6, + "Candidate with domain matching one of three anchors should have fraction 1/3" + ); + + Assert.equal( + fractions[2], + 0, + "Candidate with domain not matching any anchor should have fraction 0" + ); + + Assert.equal( + fractions[3], + 0, + "Candidate with invalid URL should have fraction 0" + ); +}); + +add_task(function test_logistic_regression_get_max_similarity() { + const mgr = new SmartTabGroupingManager(); + + const anchors = [ + [1, 0], + [0, 1], + ]; + const candidates = [ + [1, 0], // identical to first anchor -> cos ~ 1 + [0.5, 0.5], // at 45 degrees -> cos ~ 0.707 with either anchor + ]; + + const maxSims = mgr.getMaxSimilarity(anchors, candidates); + + Assert.equal( + maxSims.length, + candidates.length, + "Should return one max similarity per candidate" + ); + + Assert.less( + Math.abs(maxSims[0] - 1), + 1e-6, + "First candidate identical to first anchor should have cosine similarity ~1" + ); + + Assert.ok( + maxSims[1] > 0.7 && maxSims[1] < 0.8, + "Second candidate should have cosine similarity ~sqrt(1/2) ≈ 0.707 with at least one anchor" + ); +}); + +add_task(function test_logistic_regression_sigmoid_and_calculate_probability() { + const mgr = new SmartTabGroupingManager(); + + // Basic sigmoid sanity checks + Assert.less(Math.abs(mgr.sigmoid(0) - 0.5), 1e-6, "sigmoid(0) should be 0.5"); + + Assert.greater( + mgr.sigmoid(10), + 0.99, + "sigmoid of large positive number should be close to 1" + ); + + Assert.less( + mgr.sigmoid(-10), + 0.01, + "sigmoid of large negative number should be close to 0" + ); + + // Check that calculateProbability matches explicit linear combination + sigmoid + const params = { + GROUP_SIMILARITY_WEIGHT: 1, + TITLE_SIMILARITY_WEIGHT: 2, + DOMAIN_SIMILARITY_WEIGHT: 3, + INTERCEPT: 0, + }; + + const s_gc = 0.5; + const s_tt = 0.5; + const s_dd = 0.5; + + const prob = mgr.calculateProbability(s_gc, s_tt, s_dd, params); + const expectedZ = s_gc * 1 
+ s_tt * 2 + s_dd * 3; // 3 + const expectedProb = mgr.sigmoid(expectedZ); + + Assert.less( + Math.abs(prob - expectedProb), + 1e-6, + "calculateProbability should equal sigmoid(linear combination of features and weights)" + ); +}); + +add_task( + function test_logistic_regression_calculate_all_probabilities_with_group() { + const mgr = new SmartTabGroupingManager(); + + // cos = 0 for both candidates -> s_gc = s_tt_max = 0.5 for both + const groupSimilaritiesCos = [0, 0]; + const titleSimilaritiesCos = [0, 0]; + + // Candidate 0 has full domain match, candidate 1 has none. + const domainSimilarities = [1, 0]; + + const probs = mgr.calculateAllProbabilities( + groupSimilaritiesCos, + titleSimilaritiesCos, + domainSimilarities + ); + + Assert.equal( + probs.length, + 2, + "Should return one probability per candidate" + ); + + Assert.greater( + probs[0], + probs[1], + "With group present, candidate with higher domain match fraction should have higher probability" + ); + } +); + +add_task( + function test_logistic_regression_calculate_all_probabilities_without_group() { + const mgr = new SmartTabGroupingManager(); + + // cos = 0 for both candidates -> s_tt_max = 0.5 for both + const titleSimilaritiesCos = [0, 0]; + + // Candidate 0 has full domain match, candidate 1 has none. 
+ const domainSimilarities = [1, 0]; + + const probs = mgr.calculateAllProbabilities( + null, // no group similarities -> TITLE_ONLY params + titleSimilaritiesCos, + domainSimilarities + ); + + Assert.equal( + probs.length, + 2, + "Should return one probability per candidate" + ); + + Assert.greater( + probs[0], + probs[1], + "Without group, candidate with higher domain match fraction should have higher probability" + ); + } +); diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml @@ -8,4 +8,6 @@ firefox-appdir = "browser" ["test_calculate_probability.js"] +["test_logistic_regression_utils.js"] + ["test_text_preprocessing.js"] diff --git a/toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js b/toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js @@ -111,10 +111,27 @@ async function runTopicModel(texts, keywords = []) { return output.map(o => o.generated_text); } +// build tab object similar to what we'd expect for an actual tab +function makeUrlTab(url, label, { groupId = null } = {}) { + return { + label, + url, + group: groupId, + pinned: false, + linkedBrowser: { + currentURI: { + spec: url, + }, + }, + }; +} + const singleTabMetrics = {}; singleTabMetrics["SINGLE-TAB-LATENCY"] = []; singleTabMetrics["SINGLE-TAB-LOGISTIC-REGRESSION-LATENCY"] = []; singleTabMetrics["SINGLE-TAB-TOPIC-LATENCY"] = []; +// measure latency with domain feature +singleTabMetrics["SINGLE-TAB-LR-WITH-DOMAIN-LATENCY"] = []; add_task(async function test_clustering_nearest_neighbors() { const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB"); @@ -142,7 +159,7 @@ add_task(async function test_clustering_nearest_neighbors() { groupedIndices: [1], alreadyGroupedIndices: [], groupLabel: "Travel Planning", - thresholdMills: 300, + thresholdMills: 275, }); const endTime = performance.now(); 
singleTabMetrics["SINGLE-TAB-LATENCY"].push(endTime - startTime); @@ -205,7 +222,7 @@ add_task(async function test_clustering_logistic_regression() { const titles = similarTabs.map(s => s.label); Assert.equal( titles.length, - 5, + 3, "Proper number of similar tabs should be returned" ); Assert.equal( @@ -217,11 +234,193 @@ add_task(async function test_clustering_logistic_regression() { "Impact of Tourism on Local Communities - Google Scholar" ); Assert.equal(titles[2], "Cheap Flights, Airline Tickets & Airfare Deals"); - Assert.equal( - titles[3], - "The Influence of Travel Restrictions on the Spread of COVID-19 - Nature" + generateEmbeddingsStub.restore(); + await EngineProcess.destroyMLEngine(); + await cleanup(); +}); + +// test domain feature for Logistic Regression +add_task( + async function test_clustering_logistic_regression_domain_preference() { + const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB"); + const { cleanup } = await perfSetup({ + prefs: [["browser.ml.modelHubRootUrl", modelHubRootUrl]], + }); + + const stgManager = new SmartTabGroupingManager(); + + let generateEmbeddingsStub = sinon.stub( + SmartTabGroupingManager.prototype, + "_generateEmbeddings" + ); + generateEmbeddingsStub.callsFake(async textList => { + return await generateEmbeddings(textList); + }); + + const sharedTitle = "Smart Tab Grouping deep dive"; + + const anchor0 = makeUrlTab( + "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive/edit", + sharedTitle, + { groupId: "stg-group" } + ); + const anchor1 = makeUrlTab( + "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive-2/edit", + sharedTitle, + { groupId: "stg-group" } + ); + + const candidateSameDomain = makeUrlTab( + "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive-3/edit", + sharedTitle + ); + const candidateOtherDomain = makeUrlTab( + "https://example.com/smart-tab-grouping-deep-dive-3", + sharedTitle + ); + + const unrelated = makeUrlTab( + 
"https://www.youtube.com/watch?v=xyz", + "Cute cat compilation 2025" + ); + + const allTabs = [ + anchor0, + anchor1, + candidateSameDomain, + candidateOtherDomain, + unrelated, + ]; + + const groupedIndices = [0, 1]; + const alreadyGroupedIndices = []; + const groupLabel = sharedTitle; + + const startTime = performance.now(); + const similarTabs = await stgManager.findSimilarTabsLogisticRegression({ + allTabs, + groupedIndices, + alreadyGroupedIndices, + groupLabel, + }); + const endTime = performance.now(); + + singleTabMetrics["SINGLE-TAB-LR-WITH-DOMAIN-LATENCY"].push( + endTime - startTime + ); + + Assert.greaterOrEqual( + similarTabs.length, + 1, + "Logistic regression with domain should return at least one candidate" + ); + + const first = similarTabs[0]; + + Assert.equal( + first.linkedBrowser.currentURI.spec, + candidateSameDomain.linkedBrowser.currentURI.spec, + "Candidate sharing the anchors' base domain should be ranked first when text and group label match" + ); + + const titles = similarTabs.map(t => t.label); + Assert.ok( + !titles.includes("Cute cat compilation 2025"), + "An obviously unrelated tab should not be selected" + ); + + generateEmbeddingsStub.restore(); + await EngineProcess.destroyMLEngine(); + await cleanup(); + } +); + +/// test a trickier example with subdomains +add_task(async function test_clustering_nn_vs_lr_realistic_example() { + const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB"); + const { cleanup } = await perfSetup({ + prefs: [["browser.ml.modelHubRootUrl", modelHubRootUrl]], + }); + + const stgManager = new SmartTabGroupingManager(); + + let generateEmbeddingsStub = sinon.stub( + SmartTabGroupingManager.prototype, + "_generateEmbeddings" + ); + generateEmbeddingsStub.callsFake(async textList => { + return await generateEmbeddings(textList); + }); + + const anchor0 = makeUrlTab( + "https://docs.google.com/document/d/1-smart-tab-grouping-design/edit", + "Smart Tab Grouping – design document", + { groupId: "stg-group" } 
); - Assert.equal(titles[4], "Hotel Deals: Save Big on Hotels with Expedia"); + const anchor1 = makeUrlTab( + "https://docs.google.com/document/d/1-smart-tab-grouping-logistic-regression/edit", + "Smart Tab Grouping – logistic regression model notes", + { groupId: "stg-group" } + ); + + const candGithub = makeUrlTab( + "https://github.com/mozilla-mobile/firefox-android/issues/999999", + "Smart Tab Grouping: tune logistic regression thresholds for mobile" + ); + const candMdn = makeUrlTab( + "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map", + "Array.prototype.map() – JavaScript | MDN" + ); + const candNba = makeUrlTab( + "https://www.espn.com/nba/scoreboard", + "NBA scoreboard – live scores and results" + ); + const candRecipe = makeUrlTab( + "https://www.seriouseats.com/best-lasagna-recipe", + "The very best lasagna recipe" + ); + + const allTabs = [anchor0, anchor1, candGithub, candMdn, candNba, candRecipe]; + + const groupedIndices = [0, 1]; + const alreadyGroupedIndices = []; + const groupLabel = "Smart Tab Grouping"; + + // Nearest neighbors + const nnTabs = await stgManager.findNearestNeighbors({ + allTabs, + groupedIndices, + alreadyGroupedIndices, + groupLabel, + thresholdMills: 275, + }); + + Assert.greaterOrEqual( + nnTabs.length, + 1, + "Nearest neighbors should return at least one candidate in the realistic example" + ); + + // run LR + const lrTabs = await stgManager.findSimilarTabsLogisticRegression({ + allTabs, + groupedIndices, + alreadyGroupedIndices, + groupLabel, + }); + + Assert.greaterOrEqual( + lrTabs.length, + 1, + "Logistic regression should return at least one candidate in the realistic example" + ); + + const lrTitles = lrTabs.map(t => t.label); + Assert.ok( + !lrTitles.includes("The very best lasagna recipe"), + "Logistic regression should not select a totally unrelated lasagna recipe tab" + ); + generateEmbeddingsStub.restore(); await EngineProcess.destroyMLEngine(); await cleanup();