commit 4c51134ff3ccd71bd04ccf170fcd99434657de9d
parent 2c0430bd746ec5fca77f77564b6fb23b52355c31
Author: Vasish Baungally <vbaungally@mozilla.com>
Date: Thu, 4 Dec 2025 21:20:03 +0000
Bug 2003577 - Add Domain Handling for Smart Tab Grouping. r=tarek,tabbrowser-reviewers,ai-ondevice-reviewers,sthompson
The Logistic Regression approach has a 20% improvement over the existing Nearest Neighbor implementation on our evaluation dataset (on macro F1). We'll start an experiment evaluating these two approaches soon after this patch lands.
Differential Revision: https://phabricator.services.mozilla.com/D274984
Diffstat:
5 files changed, 626 insertions(+), 52 deletions(-)
diff --git a/browser/components/tabbrowser/SmartTabGrouping.sys.mjs b/browser/components/tabbrowser/SmartTabGrouping.sys.mjs
@@ -84,7 +84,7 @@ const MAX_NON_SUMMARIZED_SEARCH_LENGTH = 26;
export const DIM_REDUCTION_METHODS = {};
const MISSING_ANCHOR_IN_CLUSTER_PENALTY = 0.2;
-const MAX_NN_GROUPED_TABS = 4;
+const MAX_NN_GROUPED_TABS = 3;
const MAX_SUGGESTED_TABS = 10;
const DISSIMILAR_TAB_LABEL = "none";
@@ -135,19 +135,25 @@ export const SMART_TAB_GROUPING_CONFIG = {
// these parameters were generated by training a logistic regression
// model on synthetic data. see https://github.com/mozilla/smart-tab-grouping
-// for more info
+// and https://github.com/mozilla/smart-tab-grouping/pull/12 for more info
const LOGISTIC_REGRESSION_PARAMS = {
+ // Variant used when group-name similarities are supplied (the group has a label).
+ // Features in [0, 1]: s_gc (group name vs. candidate), s_tt_max (best anchor title), s_dd (domain share)
TITLE_WITH_GROUP_NAME: {
- GROUP_SIMILARITY_WEIGHT: 6.76420017,
- TITLE_SIMILARITY_WEIGHT: 2.95779555,
- INTERCEPT: -3.06862155,
- THRESHOLD: 0.45,
+ GROUP_SIMILARITY_WEIGHT: 0.10249,
+ TITLE_SIMILARITY_WEIGHT: 0.54897,
+ DOMAIN_SIMILARITY_WEIGHT: 0.34854,
+ INTERCEPT: -0.07397,
+ THRESHOLD: 0.59,
},
+ // Variant used when no group-name similarities are supplied.
+ // Features in [0, 1]: s_tt_max (best anchor title), s_dd (domain share)
TITLE_ONLY: {
- GROUP_SIMILARITY_WEIGHT: 0,
- TITLE_SIMILARITY_WEIGHT: 2.50596721,
- INTERCEPT: -0.54293376,
- THRESHOLD: 0.6,
+ GROUP_SIMILARITY_WEIGHT: 0, // unused in this variant
+ TITLE_SIMILARITY_WEIGHT: 0.92513,
+ DOMAIN_SIMILARITY_WEIGHT: 0.07487,
+ INTERCEPT: -2.58574,
+ THRESHOLD: 0.123,
},
};
@@ -442,8 +448,8 @@ export class SmartTabGroupingManager {
/**
* Calculates the average similarity between the anchor embeddings and the candidate embeddings
*
- * @param {list[Number]} anchorEmbeddings title embeddings for the anchor tabs
- * @param {list[Number]} candidateEmbeddings title embeddings for the candidate tabs
+ * @param {number[]} anchorEmbeddings title embeddings for the anchor tabs
+ * @param {number[]} candidateEmbeddings title embeddings for the candidate tabs
*/
getAverageSimilarity(anchorEmbeddings, candidateEmbeddings) {
let averageSimilarities = [];
@@ -458,6 +464,96 @@ export class SmartTabGroupingManager {
}
/**
+  * For every candidate embedding, finds the highest cosSim score it reaches
+  * against any anchor embedding (the s_tt_max feature).
+  *
+  * @param {number[][]} anchorEmbeddings title embeddings for the anchor tabs
+  * @param {number[][]} candidateEmbeddings title embeddings for the candidate tabs
+  * @return {number[]} one score per candidate; -1 when no anchor scores higher
+  */
+ getMaxSimilarity(anchorEmbeddings, candidateEmbeddings) {
+   return Array.from(candidateEmbeddings, candidate =>
+     // -1 is the starting value, so it is also the result for zero anchors.
+     anchorEmbeddings.reduce((best, anchor) => {
+       const score = cosSim(candidate, anchor);
+       return score > best ? score : best;
+     }, -1)
+   );
+ }
+
+ /**
+  * Maps a URL to the domain bucket used by the domain-match feature.
+  * Yields "" when the URL is empty, cannot be parsed, or has no hostname.
+  *
+  * @param {string} url
+  * @return {string}
+  */
+ static getBaseDomain(url) {
+   let host = "";
+   if (url) {
+     try {
+       host = new URL(url).hostname;
+     } catch (_e) {
+       // unparsable URL: leave host empty so "" is returned below
+     }
+   }
+   if (!host) {
+     return "";
+   }
+
+   try {
+     // The second argument (1) asks for one label above the registrable
+     // domain; a leading "www." is then dropped:
+     //   https://www.example.com     -> www.example.com -> example.com
+     //   https://www.docs.google.com -> docs.google.com
+     const uri = Services.io.newURI(url.toLowerCase());
+     const withOneExtraLabel = Services.eTLD.getBaseDomain(uri, 1);
+     return withOneExtraLabel.replace(/^www\./, "");
+   } catch (_e) {
+     // The lookup throws for localhost, IPs, internal hosts, etc.;
+     // those are bucketed by their full hostname.
+     return host.toLowerCase();
+   }
+ }
+
+ /**
+  * Computes s_dd for each candidate tab: the share of anchor tabs whose
+  * base domain (per getBaseDomain) equals the candidate's base domain.
+  *
+  * @param {Array} anchorTabsPrep output of _prepareTabData for anchor tabs
+  * @param {Array} candidateTabsPrep output of _prepareTabData for candidate tabs
+  * @return {number[]} array of s_dd values in [0, 1]
+  */
+ getDomainMatchFractions(anchorTabsPrep, candidateTabsPrep) {
+   // Tally how many anchors fall into each non-empty domain bucket.
+   const anchorsPerDomain = new Map();
+   for (const anchor of anchorTabsPrep) {
+     const domain = SmartTabGroupingManager.getBaseDomain(anchor.url);
+     if (domain) {
+       anchorsPerDomain.set(domain, (anchorsPerDomain.get(domain) || 0) + 1);
+     }
+   }
+   // Anchors without a domain still count in the denominator; the `|| 1`
+   // avoids dividing by zero when there are no anchors at all.
+   const divisor = anchorTabsPrep.length || 1;
+
+   return candidateTabsPrep.map(candidate => {
+     const domain = SmartTabGroupingManager.getBaseDomain(candidate.url);
+     return domain ? (anchorsPerDomain.get(domain) || 0) / divisor : 0;
+   });
+ }
+
+ /**
* Calculates the sigmoid value of the input
*
* @param {number} z
@@ -470,38 +566,62 @@ export class SmartTabGroupingManager {
/**
* Calculates the probability using the linear combination of the parameters
*
- * @param {number} groupSimilarity how similar a candidate tab is to the group name
- * @param {number} titleSimilarity how similar a candidate tab is to the anchors
+ * @param {number} groupSimilarity s_gc: how similar a candidate tab is to the group name, expected in [0,1]
+ * @param {number} titleSimilarity s_tt_max: how similar a candidate tab is to its closest anchor, expected in [0,1]
+ * @param {number} domainSimilarity s_dd: fraction of anchors sharing the candidate's base domain, expected in [0,1]
* @param {object} params the logistic regression weights assigned to each parameter
* @return {number}
*/
- calculateProbability(groupSimilarity, titleSimilarity, params) {
- return this.sigmoid(
- groupSimilarity * params.GROUP_SIMILARITY_WEIGHT +
- titleSimilarity * params.TITLE_SIMILARITY_WEIGHT +
- params.INTERCEPT
- );
+ calculateProbability(
+ groupSimilarity,
+ titleSimilarity,
+ domainSimilarity,
+ params
+ ) {
+ const wGroup = params.GROUP_SIMILARITY_WEIGHT || 0;
+ const wTitle = params.TITLE_SIMILARITY_WEIGHT || 0;
+ const wDomain = params.DOMAIN_SIMILARITY_WEIGHT || 0;
+ const z =
+ groupSimilarity * wGroup +
+ titleSimilarity * wTitle +
+ domainSimilarity * wDomain +
+ params.INTERCEPT;
+ return this.sigmoid(z);
}
/**
- * Calculates the probabilities given two lists of the same length
+ * Calculates the probabilities given similarity lists (cosine) and domain fractions.
*
- * @param {list[Number]} groupSimilarities cosine similarity between the candidate tabs and the group name
- * @param {list[Number]} titleSimilarities average cosine similarity between the candidate tabs and anchors
- * @return {list[Number]} probabilities for each candidate tab
+ * @param {number[]|null} groupSimilaritiesCos cosine(group, candidate) in [-1,1] or null
+ * @param {number[]} titleSimilaritiesCos max cosine(anchor, candidate) in [-1,1]
+ * @param {number[]} domainSimilarities s_dd in [0,1]
+ * @return {number[]} probabilities for each candidate tab
*/
- calculateAllProbabilities(groupSimilarities, titleSimilarities) {
- const hasGroupSimilarity = Boolean(groupSimilarities);
- let probabilities = [];
- for (let i = 0; i < titleSimilarities.length; i++) {
+ calculateAllProbabilities(
+ groupSimilaritiesCos,
+ titleSimilaritiesCos,
+ domainSimilarities
+ ) {
+ const hasGroupSimilarity =
+ Array.isArray(groupSimilaritiesCos) && groupSimilaritiesCos.length;
+ const useDomain =
+ Array.isArray(domainSimilarities) && domainSimilarities.length;
+
+ const probabilities = [];
+ for (let i = 0; i < titleSimilaritiesCos.length; i++) {
+ // groupTitleSim and titleSim are (cos + 1)/2 -> [0,1]
+ const groupTitleSim = hasGroupSimilarity
+ ? 0.5 * (groupSimilaritiesCos[i] + 1)
+ : 0;
+ const titleSim = 0.5 * (titleSimilaritiesCos[i] + 1);
+ const domainSim = useDomain ? domainSimilarities[i] : 0;
+
+ const params = hasGroupSimilarity
+ ? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME
+ : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY;
+
probabilities.push(
- this.calculateProbability(
- hasGroupSimilarity ? groupSimilarities[i] : 0,
- titleSimilarities[i],
- hasGroupSimilarity
- ? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME
- : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY
- )
+ this.calculateProbability(groupTitleSim, titleSim, domainSim, params)
);
}
return probabilities;
@@ -543,28 +663,35 @@ export class SmartTabGroupingManager {
);
let groupEmbedding;
- let groupSimilarities;
+ let groupSimilaritiesCos = null;
if (groupLabel) {
groupEmbedding = await this._generateEmbeddings([groupLabel]);
- // calculate similarity between the group and the candidate tabs if group name is present
- groupSimilarities = this.getAverageSimilarity(
+ // cosine(group, candidate_title) in [-1,1]
+ groupSimilaritiesCos = this.getAverageSimilarity(
groupEmbedding,
titleEmbeddings.slice(anchorTabsPrep.length)
);
}
- // calculate the similarity between the anchors and candidate titles
- const titleSimilarities = this.getAverageSimilarity(
+ // s_tt_max: max cosine(anchor_title, candidate_title) in [-1,1]
+ const titleSimilaritiesCos = this.getMaxSimilarity(
titleEmbeddings.slice(0, anchorTabsPrep.length),
titleEmbeddings.slice(anchorTabsPrep.length)
);
+ // s_dd: fraction of anchors sharing the candidate's base domain
+ const domainSimilarities = this.getDomainMatchFractions(
+ anchorTabsPrep,
+ candidateTabsPrep
+ );
+
const candidateProbabilities = this.calculateAllProbabilities(
- groupSimilarities,
- titleSimilarities
+ groupSimilaritiesCos,
+ titleSimilaritiesCos,
+ domainSimilarities
);
- // get proper params depending on group name availability
+ // get matching params depending on the group name availability
const probabilityThreshold = groupEmbedding
? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME.THRESHOLD
: LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY.THRESHOLD;
diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js
@@ -10,13 +10,16 @@ add_task(function test_calculate_probability_zero_inputs() {
const params = {
GROUP_SIMILARITY_WEIGHT: 1,
TITLE_SIMILARITY_WEIGHT: 1,
+ DOMAIN_SIMILARITY_WEIGHT: 1,
INTERCEPT: 0,
};
const groupSim = 0;
const titleSim = 0;
+ const domainSim = 0;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5
@@ -32,16 +35,19 @@ add_task(function test_calculate_probability_both_positive() {
const params = {
GROUP_SIMILARITY_WEIGHT: 1,
TITLE_SIMILARITY_WEIGHT: 1,
+ DOMAIN_SIMILARITY_WEIGHT: 1,
INTERCEPT: 0,
};
const groupSim = 1;
const titleSim = 1;
+ const domainSim = 1;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
- const expected = 1 / (1 + Math.exp(-2));
+ const expected = 1 / (1 + Math.exp(-3));
Assert.equal(
result.toPrecision(4),
expected.toPrecision(4),
@@ -54,13 +60,16 @@ add_task(function test_calculate_probability_mixed_values() {
const params = {
GROUP_SIMILARITY_WEIGHT: 2,
TITLE_SIMILARITY_WEIGHT: 3,
+ DOMAIN_SIMILARITY_WEIGHT: 0,
INTERCEPT: 0.5,
};
const groupSim = 1;
const titleSim = -1;
+ const domainSim = -1;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
const expected = 1 / (1 + Math.exp(0.5)); // sigmoid(-0.5)
@@ -76,13 +85,16 @@ add_task(function test_calculate_probability_zero_weights() {
const params = {
GROUP_SIMILARITY_WEIGHT: 0,
TITLE_SIMILARITY_WEIGHT: 0,
+ DOMAIN_SIMILARITY_WEIGHT: 0,
INTERCEPT: 0,
};
const groupSim = 5;
const titleSim = -3;
+ const domainSim = 1;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5
@@ -98,16 +110,19 @@ add_task(function test_calculate_probability_extreme_positive() {
const params = {
GROUP_SIMILARITY_WEIGHT: 1,
TITLE_SIMILARITY_WEIGHT: 1,
+ DOMAIN_SIMILARITY_WEIGHT: 1,
INTERCEPT: 0,
};
const groupSim = 10;
const titleSim = 10;
+ const domainSim = 10;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
- const expected = 1 / (1 + Math.exp(-20));
+ const expected = 1 / (1 + Math.exp(-30));
Assert.equal(
result.toPrecision(4),
expected.toPrecision(4),
@@ -120,16 +135,19 @@ add_task(function test_calculate_probability_extreme_negative() {
const params = {
GROUP_SIMILARITY_WEIGHT: 1,
TITLE_SIMILARITY_WEIGHT: 1,
+ DOMAIN_SIMILARITY_WEIGHT: 1,
INTERCEPT: 0,
};
const groupSim = -10;
const titleSim = -10;
+ const domainSim = -10;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
- const expected = 1 / (1 + Math.exp(20));
+ const expected = 1 / (1 + Math.exp(30));
Assert.equal(
result.toPrecision(4),
expected.toPrecision(4),
@@ -142,13 +160,16 @@ add_task(function test_calculate_probability_negative_intercept() {
const params = {
GROUP_SIMILARITY_WEIGHT: 1,
TITLE_SIMILARITY_WEIGHT: 1,
+ DOMAIN_SIMILARITY_WEIGHT: 0,
INTERCEPT: -1,
};
const groupSim = 0.5;
const titleSim = 0.5;
+ const domainSim = 0.5;
const result = smartTabGroupingManager.calculateProbability(
groupSim,
titleSim,
+ domainSim,
params
);
const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5
diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js
@@ -0,0 +1,225 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+const { SmartTabGroupingManager } = ChromeUtils.importESModule(
+ "moz-src:///browser/components/tabbrowser/SmartTabGrouping.sys.mjs"
+);
+
+add_task(function test_logistic_regression_get_base_domain() {
+  // Each row: input URL, expected domain bucket, assertion message.
+  const cases = [
+    // A leading "www." label is dropped
+    [
+      "https://www.example.com/path",
+      "example.com",
+      "www.example.com should normalize to example.com",
+    ],
+    // A non-www subdomain is kept
+    [
+      "https://docs.example.com",
+      "docs.example.com",
+      "Should keep last subdomain + baseDomain",
+    ],
+    // Hosted services such as blogs get their own bucket
+    [
+      "https://myblog.example.com/",
+      "myblog.example.com",
+      "Should bucket per hosted subdomain (blog, docs, etc.)",
+    ],
+    // Single-label host
+    [
+      "http://localhost",
+      "localhost",
+      "Should return hostname as-is when there is no dot",
+    ],
+    // Empty input is handled without throwing
+    ["", "", "Invalid URL should return empty string"],
+  ];
+
+  for (const [url, expected, message] of cases) {
+    Assert.equal(SmartTabGroupingManager.getBaseDomain(url), expected, message);
+  }
+});
+
+add_task(function test_logistic_regression_domain_match_fractions() {
+  const manager = new SmartTabGroupingManager();
+  const TOLERANCE = 1e-6;
+
+  // Two anchors land in the a.com bucket (with and without "www."), one in b.com.
+  const anchorTabs = [
+    "https://a.com/foo",
+    "https://www.a.com/bar",
+    "https://b.com/baz",
+  ].map(url => ({ url }));
+
+  // Expected matches, in order: 2 of 3, 1 of 3, 0 of 3, and an empty URL.
+  const candidateTabs = [
+    "https://a.com/other",
+    "https://b.com/other",
+    "https://c.com/other",
+    "",
+  ].map(url => ({ url }));
+
+  const result = manager.getDomainMatchFractions(anchorTabs, candidateTabs);
+
+  Assert.equal(
+    result.length,
+    candidateTabs.length,
+    "Should return one value per candidate"
+  );
+
+  const [twoOfThree, oneOfThree, noMatch, emptyUrl] = result;
+
+  Assert.less(
+    Math.abs(twoOfThree - 2 / 3),
+    TOLERANCE,
+    "Candidate with domain matching two of three anchors should have fraction 2/3"
+  );
+
+  Assert.less(
+    Math.abs(oneOfThree - 1 / 3),
+    TOLERANCE,
+    "Candidate with domain matching one of three anchors should have fraction 1/3"
+  );
+
+  Assert.equal(
+    noMatch,
+    0,
+    "Candidate with domain not matching any anchor should have fraction 0"
+  );
+
+  Assert.equal(
+    emptyUrl,
+    0,
+    "Candidate with invalid URL should have fraction 0"
+  );
+});
+
+add_task(function test_logistic_regression_get_max_similarity() {
+  const manager = new SmartTabGroupingManager();
+
+  // Anchors are the two axis-aligned unit vectors.
+  const anchorVectors = [
+    [1, 0],
+    [0, 1],
+  ];
+  const sameAsFirstAnchor = [1, 0]; // cos ~ 1 with the first anchor
+  const diagonal = [0.5, 0.5]; // 45 degrees from both anchors -> cos ~ 0.707
+  const candidateVectors = [sameAsFirstAnchor, diagonal];
+
+  const best = manager.getMaxSimilarity(anchorVectors, candidateVectors);
+
+  Assert.equal(
+    best.length,
+    candidateVectors.length,
+    "Should return one max similarity per candidate"
+  );
+
+  Assert.less(
+    Math.abs(best[0] - 1),
+    1e-6,
+    "First candidate identical to first anchor should have cosine similarity ~1"
+  );
+
+  const diagonalSim = best[1];
+  Assert.ok(
+    diagonalSim > 0.7 && diagonalSim < 0.8,
+    "Second candidate should have cosine similarity ~sqrt(1/2) ≈ 0.707 with at least one anchor"
+  );
+});
+
+add_task(function test_logistic_regression_sigmoid_and_calculate_probability() {
+  const manager = new SmartTabGroupingManager();
+
+  // sigmoid: midpoint and both tails
+  const atZero = manager.sigmoid(0);
+  Assert.less(Math.abs(atZero - 0.5), 1e-6, "sigmoid(0) should be 0.5");
+
+  const atLargePositive = manager.sigmoid(10);
+  Assert.greater(
+    atLargePositive,
+    0.99,
+    "sigmoid of large positive number should be close to 1"
+  );
+
+  const atLargeNegative = manager.sigmoid(-10);
+  Assert.less(
+    atLargeNegative,
+    0.01,
+    "sigmoid of large negative number should be close to 0"
+  );
+
+  // calculateProbability must equal sigmoid(w . x + intercept)
+  const weights = {
+    GROUP_SIMILARITY_WEIGHT: 1,
+    TITLE_SIMILARITY_WEIGHT: 2,
+    DOMAIN_SIMILARITY_WEIGHT: 3,
+    INTERCEPT: 0,
+  };
+  const groupFeature = 0.5;
+  const titleFeature = 0.5;
+  const domainFeature = 0.5;
+
+  const actual = manager.calculateProbability(
+    groupFeature,
+    titleFeature,
+    domainFeature,
+    weights
+  );
+
+  // 0.5 * 1 + 0.5 * 2 + 0.5 * 3 = 3
+  const linearTerm = groupFeature * 1 + titleFeature * 2 + domainFeature * 3;
+  const expected = manager.sigmoid(linearTerm);
+
+  Assert.less(
+    Math.abs(actual - expected),
+    1e-6,
+    "calculateProbability should equal sigmoid(linear combination of features and weights)"
+  );
+});
+
+add_task(
+  function test_logistic_regression_calculate_all_probabilities_with_group() {
+    const manager = new SmartTabGroupingManager();
+
+    // Both candidates get cosine 0 for group and title, so after the
+    // (cos + 1) / 2 rescaling they share s_gc = s_tt_max = 0.5.
+    const neutralCosines = [0, 0];
+    // Only the domain feature differs: full match vs. no match.
+    const domainFractions = [1, 0];
+
+    const [fullMatch, noMatch, ...extra] = manager.calculateAllProbabilities(
+      neutralCosines,
+      [...neutralCosines],
+      domainFractions
+    );
+
+    Assert.equal(
+      2 + extra.length,
+      2,
+      "Should return one probability per candidate"
+    );
+
+    Assert.greater(
+      fullMatch,
+      noMatch,
+      "With group present, candidate with higher domain match fraction should have higher probability"
+    );
+  }
+);
+
+add_task(
+  function test_logistic_regression_calculate_all_probabilities_without_group() {
+    const manager = new SmartTabGroupingManager();
+
+    // Title cosine is 0 for both candidates (s_tt_max = 0.5 after rescaling);
+    // only the domain feature differs: full match vs. no match.
+    const neutralTitleCosines = [0, 0];
+    const domainFractions = [1, 0];
+    // Passing null for the group similarities selects the TITLE_ONLY params.
+    const noGroupSimilarities = null;
+
+    const result = manager.calculateAllProbabilities(
+      noGroupSimilarities,
+      neutralTitleCosines,
+      domainFractions
+    );
+
+    Assert.equal(
+      result.length,
+      2,
+      "Should return one probability per candidate"
+    );
+
+    const [fullMatch, noMatch] = result;
+    Assert.greater(
+      fullMatch,
+      noMatch,
+      "Without group, candidate with higher domain match fraction should have higher probability"
+    );
+  }
+);
diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml
@@ -8,4 +8,6 @@ firefox-appdir = "browser"
["test_calculate_probability.js"]
+["test_logistic_regression_utils.js"]
+
["test_text_preprocessing.js"]
diff --git a/toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js b/toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js
@@ -111,10 +111,27 @@ async function runTopicModel(texts, keywords = []) {
return output.map(o => o.generated_text);
}
+// Build a minimal stand-in for a tab: label, url, group, pinned flag and a
+// linkedBrowser whose currentURI.spec mirrors the url.
+function makeUrlTab(url, label, { groupId = null } = {}) {
+  const linkedBrowser = { currentURI: { spec: url } };
+  return { label, url, group: groupId, pinned: false, linkedBrowser };
+}
+
const singleTabMetrics = {};
singleTabMetrics["SINGLE-TAB-LATENCY"] = [];
singleTabMetrics["SINGLE-TAB-LOGISTIC-REGRESSION-LATENCY"] = [];
singleTabMetrics["SINGLE-TAB-TOPIC-LATENCY"] = [];
+// measure latency with domain feature
+singleTabMetrics["SINGLE-TAB-LR-WITH-DOMAIN-LATENCY"] = [];
add_task(async function test_clustering_nearest_neighbors() {
const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB");
@@ -142,7 +159,7 @@ add_task(async function test_clustering_nearest_neighbors() {
groupedIndices: [1],
alreadyGroupedIndices: [],
groupLabel: "Travel Planning",
- thresholdMills: 300,
+ thresholdMills: 275,
});
const endTime = performance.now();
singleTabMetrics["SINGLE-TAB-LATENCY"].push(endTime - startTime);
@@ -205,7 +222,7 @@ add_task(async function test_clustering_logistic_regression() {
const titles = similarTabs.map(s => s.label);
Assert.equal(
titles.length,
- 5,
+ 3,
"Proper number of similar tabs should be returned"
);
Assert.equal(
@@ -217,11 +234,193 @@ add_task(async function test_clustering_logistic_regression() {
"Impact of Tourism on Local Communities - Google Scholar"
);
Assert.equal(titles[2], "Cheap Flights, Airline Tickets & Airfare Deals");
- Assert.equal(
- titles[3],
- "The Influence of Travel Restrictions on the Spread of COVID-19 - Nature"
+ generateEmbeddingsStub.restore();
+ await EngineProcess.destroyMLEngine();
+ await cleanup();
+});
+
+// Test the domain feature for Logistic Regression: with identical titles and
+// group label, the candidate on the anchors' domain must be ranked first.
+add_task(
+  async function test_clustering_logistic_regression_domain_preference() {
+    const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB");
+    const { cleanup } = await perfSetup({
+      prefs: [["browser.ml.modelHubRootUrl", modelHubRootUrl]],
+    });
+
+    const stgManager = new SmartTabGroupingManager();
+
+    const generateEmbeddingsStub = sinon.stub(
+      SmartTabGroupingManager.prototype,
+      "_generateEmbeddings"
+    );
+    generateEmbeddingsStub.callsFake(async textList => {
+      return await generateEmbeddings(textList);
+    });
+
+    // Everything after the stub is installed runs inside try/finally so that
+    // a rejected call or a thrown error cannot leave the prototype stubbed or
+    // the ML engine alive for the tasks that follow.
+    try {
+      const sharedTitle = "Smart Tab Grouping deep dive";
+
+      const anchor0 = makeUrlTab(
+        "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive/edit",
+        sharedTitle,
+        { groupId: "stg-group" }
+      );
+      const anchor1 = makeUrlTab(
+        "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive-2/edit",
+        sharedTitle,
+        { groupId: "stg-group" }
+      );
+
+      const candidateSameDomain = makeUrlTab(
+        "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive-3/edit",
+        sharedTitle
+      );
+      const candidateOtherDomain = makeUrlTab(
+        "https://example.com/smart-tab-grouping-deep-dive-3",
+        sharedTitle
+      );
+
+      const unrelated = makeUrlTab(
+        "https://www.youtube.com/watch?v=xyz",
+        "Cute cat compilation 2025"
+      );
+
+      const allTabs = [
+        anchor0,
+        anchor1,
+        candidateSameDomain,
+        candidateOtherDomain,
+        unrelated,
+      ];
+
+      const groupedIndices = [0, 1];
+      const alreadyGroupedIndices = [];
+      const groupLabel = sharedTitle;
+
+      const startTime = performance.now();
+      const similarTabs = await stgManager.findSimilarTabsLogisticRegression({
+        allTabs,
+        groupedIndices,
+        alreadyGroupedIndices,
+        groupLabel,
+      });
+      const endTime = performance.now();
+
+      singleTabMetrics["SINGLE-TAB-LR-WITH-DOMAIN-LATENCY"].push(
+        endTime - startTime
+      );
+
+      Assert.greaterOrEqual(
+        similarTabs.length,
+        1,
+        "Logistic regression with domain should return at least one candidate"
+      );
+
+      // Optional chaining: an empty result should surface as a failed
+      // comparison below rather than a TypeError on undefined.
+      const firstSpec = similarTabs[0]?.linkedBrowser.currentURI.spec;
+
+      Assert.equal(
+        firstSpec,
+        candidateSameDomain.linkedBrowser.currentURI.spec,
+        "Candidate sharing the anchors' base domain should be ranked first when text and group label match"
+      );
+
+      const titles = similarTabs.map(t => t.label);
+      Assert.ok(
+        !titles.includes("Cute cat compilation 2025"),
+        "An obviously unrelated tab should not be selected"
+      );
+    } finally {
+      generateEmbeddingsStub.restore();
+      await EngineProcess.destroyMLEngine();
+      await cleanup();
+    }
+  }
+);
+
+// test a trickier example with subdomains
+add_task(async function test_clustering_nn_vs_lr_realistic_example() {
+ const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB");
+ const { cleanup } = await perfSetup({
+ prefs: [["browser.ml.modelHubRootUrl", modelHubRootUrl]],
+ });
+
+ const stgManager = new SmartTabGroupingManager();
+
+ let generateEmbeddingsStub = sinon.stub(
+ SmartTabGroupingManager.prototype,
+ "_generateEmbeddings"
+ );
+ generateEmbeddingsStub.callsFake(async textList => {
+ return await generateEmbeddings(textList);
+ });
+
+ const anchor0 = makeUrlTab(
+ "https://docs.google.com/document/d/1-smart-tab-grouping-design/edit",
+ "Smart Tab Grouping – design document",
+ { groupId: "stg-group" }
);
- Assert.equal(titles[4], "Hotel Deals: Save Big on Hotels with Expedia");
+ const anchor1 = makeUrlTab(
+ "https://docs.google.com/document/d/1-smart-tab-grouping-logistic-regression/edit",
+ "Smart Tab Grouping – logistic regression model notes",
+ { groupId: "stg-group" }
+ );
+
+ const candGithub = makeUrlTab(
+ "https://github.com/mozilla-mobile/firefox-android/issues/999999",
+ "Smart Tab Grouping: tune logistic regression thresholds for mobile"
+ );
+ const candMdn = makeUrlTab(
+ "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map",
+ "Array.prototype.map() – JavaScript | MDN"
+ );
+ const candNba = makeUrlTab(
+ "https://www.espn.com/nba/scoreboard",
+ "NBA scoreboard – live scores and results"
+ );
+ const candRecipe = makeUrlTab(
+ "https://www.seriouseats.com/best-lasagna-recipe",
+ "The very best lasagna recipe"
+ );
+
+ const allTabs = [anchor0, anchor1, candGithub, candMdn, candNba, candRecipe];
+
+ const groupedIndices = [0, 1];
+ const alreadyGroupedIndices = [];
+ const groupLabel = "Smart Tab Grouping";
+
+ // Nearest neighbors
+ const nnTabs = await stgManager.findNearestNeighbors({
+ allTabs,
+ groupedIndices,
+ alreadyGroupedIndices,
+ groupLabel,
+ thresholdMills: 275,
+ });
+
+ Assert.greaterOrEqual(
+ nnTabs.length,
+ 1,
+ "Nearest neighbors should return at least one candidate in the realistic example"
+ );
+
+ // run LR
+ const lrTabs = await stgManager.findSimilarTabsLogisticRegression({
+ allTabs,
+ groupedIndices,
+ alreadyGroupedIndices,
+ groupLabel,
+ });
+
+ Assert.greaterOrEqual(
+ lrTabs.length,
+ 1,
+ "Logistic regression should return at least one candidate in the realistic example"
+ );
+
+ const lrTitles = lrTabs.map(t => t.label);
+ Assert.ok(
+ !lrTitles.includes("The very best lasagna recipe"),
+ "Logistic regression should not select a totally unrelated lasagna recipe tab"
+ );
+
generateEmbeddingsStub.restore();
await EngineProcess.destroyMLEngine();
await cleanup();