[ tor-browser ].git.dasho

commit 4c51134ff3ccd71bd04ccf170fcd99434657de9d
parent 2c0430bd746ec5fca77f77564b6fb23b52355c31
Author: Vasish Baungally <vbaungally@mozilla.com>
Date:   Thu,  4 Dec 2025 21:20:03 +0000

Bug 2003577 - Add Domain Handling for Smart Tab Grouping. r=tarek,tabbrowser-reviewers,ai-ondevice-reviewers,sthompson

The Logistic Regression approach has a 20% improvement over the existing Nearest Neighbor implementation on our evaluation dataset (on macro F1). We'll start an experiment evaluating these two approaches soon after this patch lands.

Differential Revision: https://phabricator.services.mozilla.com/D274984

Diffstat:
M browser/components/tabbrowser/SmartTabGrouping.sys.mjs  | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js  | 27 ++++++++++++++++++++++++---
A browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js  | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml  | 2 ++
M toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js  | 211 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---

5 files changed, 626 insertions(+), 52 deletions(-)
diff --git a/browser/components/tabbrowser/SmartTabGrouping.sys.mjs b/browser/components/tabbrowser/SmartTabGrouping.sys.mjs
@@ -84,7 +84,7 @@ const MAX_NON_SUMMARIZED_SEARCH_LENGTH = 26;
 
 export const DIM_REDUCTION_METHODS = {};
 const MISSING_ANCHOR_IN_CLUSTER_PENALTY = 0.2;
-const MAX_NN_GROUPED_TABS = 4;
+const MAX_NN_GROUPED_TABS = 3;
 const MAX_SUGGESTED_TABS = 10;
 
 const DISSIMILAR_TAB_LABEL = "none";
@@ -135,19 +135,25 @@ export const SMART_TAB_GROUPING_CONFIG = {
 
 // these parameters were generated by training a logistic regression
 // model on synthetic data. see https://github.com/mozilla/smart-tab-grouping
-// for more info
+// and https://github.com/mozilla/smart-tab-grouping/pull/12 for more info
 const LOGISTIC_REGRESSION_PARAMS = {
+  // Logistic WITH group name
+  // Features: s_gc, s_tt_max, s_dd in [0, 1]
   TITLE_WITH_GROUP_NAME: {
-    GROUP_SIMILARITY_WEIGHT: 6.76420017,
-    TITLE_SIMILARITY_WEIGHT: 2.95779555,
-    INTERCEPT: -3.06862155,
-    THRESHOLD: 0.45,
+    GROUP_SIMILARITY_WEIGHT: 0.10249,
+    TITLE_SIMILARITY_WEIGHT: 0.54897,
+    DOMAIN_SIMILARITY_WEIGHT: 0.34854,
+    INTERCEPT: -0.07397,
+    THRESHOLD: 0.59,
   },
+  // Logistic WITHOUT group name
+  // Features: s_tt_max, s_dd in [0, 1]
   TITLE_ONLY: {
-    GROUP_SIMILARITY_WEIGHT: 0,
-    TITLE_SIMILARITY_WEIGHT: 2.50596721,
-    INTERCEPT: -0.54293376,
-    THRESHOLD: 0.6,
+    GROUP_SIMILARITY_WEIGHT: 0, // unused in this variant
+    TITLE_SIMILARITY_WEIGHT: 0.92513,
+    DOMAIN_SIMILARITY_WEIGHT: 0.07487,
+    INTERCEPT: -2.58574,
+    THRESHOLD: 0.123,
   },
 };
 
@@ -442,8 +448,8 @@ export class SmartTabGroupingManager {
   /**
    * Calculates the average similarity between the anchor embeddings and the candidate embeddings
    *
-   * @param {list[Number]} anchorEmbeddings title embeddings for the anchor tabs
-   * @param {list[Number]} candidateEmbeddings title embeddings for the candidate tabs
+   * @param {number[][]} anchorEmbeddings title embeddings for the anchor tabs (one vector per tab)
+   * @param {number[][]} candidateEmbeddings title embeddings for the candidate tabs (one vector per tab)
    */
   getAverageSimilarity(anchorEmbeddings, candidateEmbeddings) {
     let averageSimilarities = [];
@@ -458,6 +464,96 @@ export class SmartTabGroupingManager {
   }
 
   /**
+   * Calculates the max similarity between the anchor embeddings and the candidate embeddings
+   * (used for s_tt_max).
+   *
+   * @param {number[][]} anchorEmbeddings title embeddings for the anchor tabs (one vector per tab)
+   * @param {number[][]} candidateEmbeddings title embeddings for the candidate tabs (one vector per tab)
+   */
+  getMaxSimilarity(anchorEmbeddings, candidateEmbeddings) {
+    let maxSimilarities = [];
+    for (let candidate_embedding of candidateEmbeddings) {
+      let maxSimilarity = -1;
+      for (let anchor_embedding of anchorEmbeddings) {
+        const sim = cosSim(candidate_embedding, anchor_embedding);
+        if (sim > maxSimilarity) {
+          maxSimilarity = sim;
+        }
+      }
+      maxSimilarities.push(maxSimilarity);
+    }
+    return maxSimilarities;
+  }
+
+  /**
+   * Extract base domain from a URL with error handling
+   *
+   * @param {string} url
+   * @return {string}
+   */
+  static getBaseDomain(url) {
+    if (!url) {
+      return "";
+    }
+
+    let hostname;
+    try {
+      ({ hostname } = new URL(url));
+    } catch (_e) {
+      // invalid URL
+      return "";
+    }
+
+    if (!hostname) {
+      return "";
+    }
+
+    try {
+      // additionalParts = 1 → one label above the registrable domain
+      // then remove 'www'
+      // https://www.example.com -> www.example.com -> example.com
+      // https://www.docs.google.com -> docs.google.com
+      // https://localhost -> error
+      return Services.eTLD
+        .getBaseDomain(Services.io.newURI(url.toLowerCase()), 1)
+        .replace(/^www\./, "");
+    } catch (_e) {
+      // localhost, IPs, internal hosts, hosts with no label above the registrable domain (e.g. example.com), etc.
+      // bucket by the hostname.
+      return hostname.toLowerCase();
+    }
+  }
+
+  /**
+   * For each candidate tab, compute s_dd = fraction of anchors whose base domain
+   * matches the candidate's base domain.
+   *
+   * @param {Array} anchorTabsPrep  output of _prepareTabData for anchor tabs
+   * @param {Array} candidateTabsPrep output of _prepareTabData for candidate tabs
+   * @return {number[]} array of s_dd values in [0, 1]
+   */
+  getDomainMatchFractions(anchorTabsPrep, candidateTabsPrep) {
+    const anchorDomains = anchorTabsPrep.map(t =>
+      SmartTabGroupingManager.getBaseDomain(t.url)
+    );
+    const numAnchors = anchorDomains.length || 1;
+
+    return candidateTabsPrep.map(tab => {
+      const candDomain = SmartTabGroupingManager.getBaseDomain(tab.url);
+      if (!candDomain) {
+        return 0;
+      }
+      let same = 0;
+      for (const ad of anchorDomains) {
+        if (ad && ad === candDomain) {
+          same++;
+        }
+      }
+      return same / numAnchors;
+    });
+  }
+
+  /**
    * Calculates the sigmoid value of the input
    *
    * @param {number} z
@@ -470,38 +566,62 @@ export class SmartTabGroupingManager {
   /**
    * Calculates the probability using the linear combination of the parameters
    *
-   * @param {number} groupSimilarity how similar a candidate tab is to the group name
-   * @param {number} titleSimilarity how similar a candidate tab is to the anchors
+   * @param {number} groupSimilarity s_gc in [0,1]
+   * @param {number} titleSimilarity s_tt_max in [0,1]
+   * @param {number} domainSimilarity s_dd in [0,1]
    * @param {object} params the logistic regression weights assigned to each parameter
    * @return {number}
    */
-  calculateProbability(groupSimilarity, titleSimilarity, params) {
-    return this.sigmoid(
-      groupSimilarity * params.GROUP_SIMILARITY_WEIGHT +
-        titleSimilarity * params.TITLE_SIMILARITY_WEIGHT +
-        params.INTERCEPT
-    );
+  calculateProbability(
+    groupSimilarity,
+    titleSimilarity,
+    domainSimilarity,
+    params
+  ) {
+    const wGroup = params.GROUP_SIMILARITY_WEIGHT || 0;
+    const wTitle = params.TITLE_SIMILARITY_WEIGHT || 0;
+    const wDomain = params.DOMAIN_SIMILARITY_WEIGHT || 0;
+    const z =
+      groupSimilarity * wGroup +
+      titleSimilarity * wTitle +
+      domainSimilarity * wDomain +
+      params.INTERCEPT;
+    return this.sigmoid(z);
   }
 
   /**
-   * Calculates the probabilities given two lists of the same length
+   * Calculates the probabilities given similarity lists (cosine) and domain fractions.
    *
-   * @param {list[Number]} groupSimilarities cosine similarity between the candidate tabs and the group name
-   * @param {list[Number]} titleSimilarities average cosine similarity between the candidate tabs and anchors
-   * @return {list[Number]} probabilities for each candidate tab
+   * @param {number[]|null} groupSimilaritiesCos cosine(group, candidate) in [-1,1] or null
+   * @param {number[]} titleSimilaritiesCos max cosine(anchor, candidate) in [-1,1]
+   * @param {number[]} domainSimilarities s_dd in [0,1]
+   * @return {number[]} probabilities for each candidate tab
    */
-  calculateAllProbabilities(groupSimilarities, titleSimilarities) {
-    const hasGroupSimilarity = Boolean(groupSimilarities);
-    let probabilities = [];
-    for (let i = 0; i < titleSimilarities.length; i++) {
+  calculateAllProbabilities(
+    groupSimilaritiesCos,
+    titleSimilaritiesCos,
+    domainSimilarities
+  ) {
+    const hasGroupSimilarity =
+      Array.isArray(groupSimilaritiesCos) && groupSimilaritiesCos.length;
+    const useDomain =
+      Array.isArray(domainSimilarities) && domainSimilarities.length;
+
+    const probabilities = [];
+    for (let i = 0; i < titleSimilaritiesCos.length; i++) {
+      // groupTitleSim and titleSim are (cos + 1)/2 -> [0,1]
+      const groupTitleSim = hasGroupSimilarity
+        ? 0.5 * (groupSimilaritiesCos[i] + 1)
+        : 0;
+      const titleSim = 0.5 * (titleSimilaritiesCos[i] + 1);
+      const domainSim = useDomain ? domainSimilarities[i] : 0;
+
+      const params = hasGroupSimilarity
+        ? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME
+        : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY;
+
       probabilities.push(
-        this.calculateProbability(
-          hasGroupSimilarity ? groupSimilarities[i] : 0,
-          titleSimilarities[i],
-          hasGroupSimilarity
-            ? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME
-            : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY
-        )
+        this.calculateProbability(groupTitleSim, titleSim, domainSim, params)
       );
     }
     return probabilities;
@@ -543,28 +663,35 @@ export class SmartTabGroupingManager {
     );
 
     let groupEmbedding;
-    let groupSimilarities;
+    let groupSimilaritiesCos = null;
     if (groupLabel) {
       groupEmbedding = await this._generateEmbeddings([groupLabel]);
-      // calculate similarity between the group and the candidate tabs if group name is present
-      groupSimilarities = this.getAverageSimilarity(
+      // cosine(group, candidate_title) in [-1,1]
+      groupSimilaritiesCos = this.getAverageSimilarity(
         groupEmbedding,
         titleEmbeddings.slice(anchorTabsPrep.length)
       );
     }
 
-    // calculate the similarity between the anchors and candidate titles
-    const titleSimilarities = this.getAverageSimilarity(
+    // s_tt_max: max cosine(anchor_title, candidate_title) in [-1,1]
+    const titleSimilaritiesCos = this.getMaxSimilarity(
       titleEmbeddings.slice(0, anchorTabsPrep.length),
       titleEmbeddings.slice(anchorTabsPrep.length)
     );
 
+    // s_dd: fraction of anchors sharing the candidate's base domain
+    const domainSimilarities = this.getDomainMatchFractions(
+      anchorTabsPrep,
+      candidateTabsPrep
+    );
+
     const candidateProbabilities = this.calculateAllProbabilities(
-      groupSimilarities,
-      titleSimilarities
+      groupSimilaritiesCos,
+      titleSimilaritiesCos,
+      domainSimilarities
     );
 
-    // get proper params depending on group name availability
+    // pick the decision threshold that matches the params used above (depends on group name availability)
     const probabilityThreshold = groupEmbedding
       ? LOGISTIC_REGRESSION_PARAMS.TITLE_WITH_GROUP_NAME.THRESHOLD
       : LOGISTIC_REGRESSION_PARAMS.TITLE_ONLY.THRESHOLD;
diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js
@@ -10,13 +10,16 @@ add_task(function test_calculate_probability_zero_inputs() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 1,
     TITLE_SIMILARITY_WEIGHT: 1,
+    DOMAIN_SIMILARITY_WEIGHT: 1,
     INTERCEPT: 0,
   };
   const groupSim = 0;
   const titleSim = 0;
+  const domainSim = 0;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
   const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5
@@ -32,16 +35,19 @@ add_task(function test_calculate_probability_both_positive() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 1,
     TITLE_SIMILARITY_WEIGHT: 1,
+    DOMAIN_SIMILARITY_WEIGHT: 1,
     INTERCEPT: 0,
   };
   const groupSim = 1;
   const titleSim = 1;
+  const domainSim = 1;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
-  const expected = 1 / (1 + Math.exp(-2));
+  const expected = 1 / (1 + Math.exp(-3));
   Assert.equal(
     result.toPrecision(4),
     expected.toPrecision(4),
@@ -54,13 +60,16 @@ add_task(function test_calculate_probability_mixed_values() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 2,
     TITLE_SIMILARITY_WEIGHT: 3,
+    DOMAIN_SIMILARITY_WEIGHT: 0,
     INTERCEPT: 0.5,
   };
   const groupSim = 1;
   const titleSim = -1;
+  const domainSim = -1;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
   const expected = 1 / (1 + Math.exp(0.5)); // sigmoid(-0.5)
@@ -76,13 +85,16 @@ add_task(function test_calculate_probability_zero_weights() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 0,
     TITLE_SIMILARITY_WEIGHT: 0,
+    DOMAIN_SIMILARITY_WEIGHT: 0,
     INTERCEPT: 0,
   };
   const groupSim = 5;
   const titleSim = -3;
+  const domainSim = 1;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
   const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5
@@ -98,16 +110,19 @@ add_task(function test_calculate_probability_extreme_positive() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 1,
     TITLE_SIMILARITY_WEIGHT: 1,
+    DOMAIN_SIMILARITY_WEIGHT: 1,
     INTERCEPT: 0,
   };
   const groupSim = 10;
   const titleSim = 10;
+  const domainSim = 10;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
-  const expected = 1 / (1 + Math.exp(-20));
+  const expected = 1 / (1 + Math.exp(-30));
   Assert.equal(
     result.toPrecision(4),
     expected.toPrecision(4),
@@ -120,16 +135,19 @@ add_task(function test_calculate_probability_extreme_negative() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 1,
     TITLE_SIMILARITY_WEIGHT: 1,
+    DOMAIN_SIMILARITY_WEIGHT: 1,
     INTERCEPT: 0,
   };
   const groupSim = -10;
   const titleSim = -10;
+  const domainSim = -10;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
-  const expected = 1 / (1 + Math.exp(20));
+  const expected = 1 / (1 + Math.exp(30));
   Assert.equal(
     result.toPrecision(4),
     expected.toPrecision(4),
@@ -142,13 +160,16 @@ add_task(function test_calculate_probability_negative_intercept() {
   const params = {
     GROUP_SIMILARITY_WEIGHT: 1,
     TITLE_SIMILARITY_WEIGHT: 1,
+    DOMAIN_SIMILARITY_WEIGHT: 0,
     INTERCEPT: -1,
   };
   const groupSim = 0.5;
   const titleSim = 0.5;
+  const domainSim = 0.5;
   const result = smartTabGroupingManager.calculateProbability(
     groupSim,
     titleSim,
+    domainSim,
     params
   );
   const expected = 1 / (1 + Math.exp(0)); // sigmoid(0) = 0.5
diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js
@@ -0,0 +1,225 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+const { SmartTabGroupingManager } = ChromeUtils.importESModule(
+  "moz-src:///browser/components/tabbrowser/SmartTabGrouping.sys.mjs"
+);
+
+add_task(function test_logistic_regression_get_base_domain() {
+  // Basic HTTPS URL with www
+  Assert.equal(
+    SmartTabGroupingManager.getBaseDomain("https://www.example.com/path"),
+    "example.com",
+    "www.example.com should normalize to example.com"
+  );
+
+  // Single subdomain: kept as part of the bucket (one label above the registrable domain)
+  Assert.equal(
+    SmartTabGroupingManager.getBaseDomain("https://docs.example.com"),
+    "docs.example.com",
+    "Should keep last subdomain + baseDomain"
+  );
+
+  // Hosted services like blogs
+  Assert.equal(
+    SmartTabGroupingManager.getBaseDomain("https://myblog.example.com/"),
+    "myblog.example.com",
+    "Should bucket per hosted subdomain (blog, docs, etc.)"
+  );
+
+  // Host without dots
+  Assert.equal(
+    SmartTabGroupingManager.getBaseDomain("http://localhost"),
+    "localhost",
+    "Should return hostname as-is when there is no dot"
+  );
+
+  // Invalid / empty URL should be handled gracefully
+  Assert.equal(
+    SmartTabGroupingManager.getBaseDomain(""),
+    "",
+    "Invalid URL should return empty string"
+  );
+});
+
+add_task(function test_logistic_regression_domain_match_fractions() {
+  const mgr = new SmartTabGroupingManager();
+
+  const anchors = [
+    { url: "https://a.com/foo" },
+    { url: "https://www.a.com/bar" },
+    { url: "https://b.com/baz" },
+  ];
+  const candidates = [
+    { url: "https://a.com/other" }, // matches 2 of 3 anchors
+    { url: "https://b.com/other" }, // matches 1 of 3 anchors
+    { url: "https://c.com/other" }, // matches 0 of 3 anchors
+    { url: "" }, // invalid / empty URL
+  ];
+
+  const fractions = mgr.getDomainMatchFractions(anchors, candidates);
+
+  Assert.equal(
+    fractions.length,
+    candidates.length,
+    "Should return one value per candidate"
+  );
+
+  Assert.less(
+    Math.abs(fractions[0] - 2 / 3),
+    1e-6,
+    "Candidate with domain matching two of three anchors should have fraction 2/3"
+  );
+
+  Assert.less(
+    Math.abs(fractions[1] - 1 / 3),
+    1e-6,
+    "Candidate with domain matching one of three anchors should have fraction 1/3"
+  );
+
+  Assert.equal(
+    fractions[2],
+    0,
+    "Candidate with domain not matching any anchor should have fraction 0"
+  );
+
+  Assert.equal(
+    fractions[3],
+    0,
+    "Candidate with invalid URL should have fraction 0"
+  );
+});
+
+add_task(function test_logistic_regression_get_max_similarity() {
+  const mgr = new SmartTabGroupingManager();
+
+  const anchors = [
+    [1, 0],
+    [0, 1],
+  ];
+  const candidates = [
+    [1, 0], // identical to first anchor -> cos ~ 1
+    [0.5, 0.5], // at 45 degrees -> cos ~ 0.707 with either anchor
+  ];
+
+  const maxSims = mgr.getMaxSimilarity(anchors, candidates);
+
+  Assert.equal(
+    maxSims.length,
+    candidates.length,
+    "Should return one max similarity per candidate"
+  );
+
+  Assert.less(
+    Math.abs(maxSims[0] - 1),
+    1e-6,
+    "First candidate identical to first anchor should have cosine similarity ~1"
+  );
+
+  Assert.ok(
+    maxSims[1] > 0.7 && maxSims[1] < 0.8,
+    "Second candidate should have cosine similarity ~sqrt(1/2) ≈ 0.707 with at least one anchor"
+  );
+});
+
+add_task(function test_logistic_regression_sigmoid_and_calculate_probability() {
+  const mgr = new SmartTabGroupingManager();
+
+  // Basic sigmoid sanity checks
+  Assert.less(Math.abs(mgr.sigmoid(0) - 0.5), 1e-6, "sigmoid(0) should be 0.5");
+
+  Assert.greater(
+    mgr.sigmoid(10),
+    0.99,
+    "sigmoid of large positive number should be close to 1"
+  );
+
+  Assert.less(
+    mgr.sigmoid(-10),
+    0.01,
+    "sigmoid of large negative number should be close to 0"
+  );
+
+  // Check that calculateProbability matches explicit linear combination + sigmoid
+  const params = {
+    GROUP_SIMILARITY_WEIGHT: 1,
+    TITLE_SIMILARITY_WEIGHT: 2,
+    DOMAIN_SIMILARITY_WEIGHT: 3,
+    INTERCEPT: 0,
+  };
+
+  const s_gc = 0.5;
+  const s_tt = 0.5;
+  const s_dd = 0.5;
+
+  const prob = mgr.calculateProbability(s_gc, s_tt, s_dd, params);
+  const expectedZ = s_gc * 1 + s_tt * 2 + s_dd * 3; // 3
+  const expectedProb = mgr.sigmoid(expectedZ);
+
+  Assert.less(
+    Math.abs(prob - expectedProb),
+    1e-6,
+    "calculateProbability should equal sigmoid(linear combination of features and weights)"
+  );
+});
+
+add_task(
+  function test_logistic_regression_calculate_all_probabilities_with_group() {
+    const mgr = new SmartTabGroupingManager();
+
+    // cos = 0 for both candidates -> s_gc = s_tt_max = 0.5 for both
+    const groupSimilaritiesCos = [0, 0];
+    const titleSimilaritiesCos = [0, 0];
+
+    // Candidate 0 has full domain match, candidate 1 has none.
+    const domainSimilarities = [1, 0];
+
+    const probs = mgr.calculateAllProbabilities(
+      groupSimilaritiesCos,
+      titleSimilaritiesCos,
+      domainSimilarities
+    );
+
+    Assert.equal(
+      probs.length,
+      2,
+      "Should return one probability per candidate"
+    );
+
+    Assert.greater(
+      probs[0],
+      probs[1],
+      "With group present, candidate with higher domain match fraction should have higher probability"
+    );
+  }
+);
+
+add_task(
+  function test_logistic_regression_calculate_all_probabilities_without_group() {
+    const mgr = new SmartTabGroupingManager();
+
+    // cos = 0 for both candidates -> s_tt_max = 0.5 for both
+    const titleSimilaritiesCos = [0, 0];
+
+    // Candidate 0 has full domain match, candidate 1 has none.
+    const domainSimilarities = [1, 0];
+
+    const probs = mgr.calculateAllProbabilities(
+      null, // no group similarities -> TITLE_ONLY params
+      titleSimilaritiesCos,
+      domainSimilarities
+    );
+
+    Assert.equal(
+      probs.length,
+      2,
+      "Should return one probability per candidate"
+    );
+
+    Assert.greater(
+      probs[0],
+      probs[1],
+      "Without group, candidate with higher domain match fraction should have higher probability"
+    );
+  }
+);
diff --git a/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml b/browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml
@@ -8,4 +8,6 @@ firefox-appdir = "browser"
 
 ["test_calculate_probability.js"]
 
+["test_logistic_regression_utils.js"]
+
 ["test_text_preprocessing.js"]
diff --git a/toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js b/toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js
@@ -111,10 +111,27 @@ async function runTopicModel(texts, keywords = []) {
   return output.map(o => o.generated_text);
 }
 
+// build tab object similar to what we'd expect for an actual tab
+function makeUrlTab(url, label, { groupId = null } = {}) {
+  return {
+    label,
+    url,
+    group: groupId,
+    pinned: false,
+    linkedBrowser: {
+      currentURI: {
+        spec: url,
+      },
+    },
+  };
+}
+
 const singleTabMetrics = {};
 singleTabMetrics["SINGLE-TAB-LATENCY"] = [];
 singleTabMetrics["SINGLE-TAB-LOGISTIC-REGRESSION-LATENCY"] = [];
 singleTabMetrics["SINGLE-TAB-TOPIC-LATENCY"] = [];
+// measure latency with domain feature
+singleTabMetrics["SINGLE-TAB-LR-WITH-DOMAIN-LATENCY"] = [];
 
 add_task(async function test_clustering_nearest_neighbors() {
   const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB");
@@ -142,7 +159,7 @@ add_task(async function test_clustering_nearest_neighbors() {
     groupedIndices: [1],
     alreadyGroupedIndices: [],
     groupLabel: "Travel Planning",
-    thresholdMills: 300,
+    thresholdMills: 275,
   });
   const endTime = performance.now();
   singleTabMetrics["SINGLE-TAB-LATENCY"].push(endTime - startTime);
@@ -205,7 +222,7 @@ add_task(async function test_clustering_logistic_regression() {
   const titles = similarTabs.map(s => s.label);
   Assert.equal(
     titles.length,
-    5,
+    3,
     "Proper number of similar tabs should be returned"
   );
   Assert.equal(
@@ -217,11 +234,193 @@ add_task(async function test_clustering_logistic_regression() {
     "Impact of Tourism on Local Communities - Google Scholar"
   );
   Assert.equal(titles[2], "Cheap Flights, Airline Tickets & Airfare Deals");
-  Assert.equal(
-    titles[3],
-    "The Influence of Travel Restrictions on the Spread of COVID-19 - Nature"
+  generateEmbeddingsStub.restore();
+  await EngineProcess.destroyMLEngine();
+  await cleanup();
+});
+
+// test domain feature for Logistic Regression
+add_task(
+  async function test_clustering_logistic_regression_domain_preference() {
+    const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB");
+    const { cleanup } = await perfSetup({
+      prefs: [["browser.ml.modelHubRootUrl", modelHubRootUrl]],
+    });
+
+    const stgManager = new SmartTabGroupingManager();
+
+    let generateEmbeddingsStub = sinon.stub(
+      SmartTabGroupingManager.prototype,
+      "_generateEmbeddings"
+    );
+    generateEmbeddingsStub.callsFake(async textList => {
+      return await generateEmbeddings(textList);
+    });
+
+    const sharedTitle = "Smart Tab Grouping deep dive";
+
+    const anchor0 = makeUrlTab(
+      "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive/edit",
+      sharedTitle,
+      { groupId: "stg-group" }
+    );
+    const anchor1 = makeUrlTab(
+      "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive-2/edit",
+      sharedTitle,
+      { groupId: "stg-group" }
+    );
+
+    const candidateSameDomain = makeUrlTab(
+      "https://docs.google.com/document/d/1-smart-tab-grouping-deep-dive-3/edit",
+      sharedTitle
+    );
+    const candidateOtherDomain = makeUrlTab(
+      "https://example.com/smart-tab-grouping-deep-dive-3",
+      sharedTitle
+    );
+
+    const unrelated = makeUrlTab(
+      "https://www.youtube.com/watch?v=xyz",
+      "Cute cat compilation 2025"
+    );
+
+    const allTabs = [
+      anchor0,
+      anchor1,
+      candidateSameDomain,
+      candidateOtherDomain,
+      unrelated,
+    ];
+
+    const groupedIndices = [0, 1];
+    const alreadyGroupedIndices = [];
+    const groupLabel = sharedTitle;
+
+    const startTime = performance.now();
+    const similarTabs = await stgManager.findSimilarTabsLogisticRegression({
+      allTabs,
+      groupedIndices,
+      alreadyGroupedIndices,
+      groupLabel,
+    });
+    const endTime = performance.now();
+
+    singleTabMetrics["SINGLE-TAB-LR-WITH-DOMAIN-LATENCY"].push(
+      endTime - startTime
+    );
+
+    Assert.greaterOrEqual(
+      similarTabs.length,
+      1,
+      "Logistic regression with domain should return at least one candidate"
+    );
+
+    const first = similarTabs[0];
+
+    Assert.equal(
+      first.linkedBrowser.currentURI.spec,
+      candidateSameDomain.linkedBrowser.currentURI.spec,
+      "Candidate sharing the anchors' base domain should be ranked first when text and group label match"
+    );
+
+    const titles = similarTabs.map(t => t.label);
+    Assert.ok(
+      !titles.includes("Cute cat compilation 2025"),
+      "An obviously unrelated tab should not be selected"
+    );
+
+    generateEmbeddingsStub.restore();
+    await EngineProcess.destroyMLEngine();
+    await cleanup();
+  }
+);
+
+// test a trickier, more realistic example with subdomains (nearest neighbors vs. logistic regression)
+add_task(async function test_clustering_nn_vs_lr_realistic_example() {
+  const modelHubRootUrl = Services.env.get("MOZ_MODELS_HUB");
+  const { cleanup } = await perfSetup({
+    prefs: [["browser.ml.modelHubRootUrl", modelHubRootUrl]],
+  });
+
+  const stgManager = new SmartTabGroupingManager();
+
+  let generateEmbeddingsStub = sinon.stub(
+    SmartTabGroupingManager.prototype,
+    "_generateEmbeddings"
+  );
+  generateEmbeddingsStub.callsFake(async textList => {
+    return await generateEmbeddings(textList);
+  });
+
+  const anchor0 = makeUrlTab(
+    "https://docs.google.com/document/d/1-smart-tab-grouping-design/edit",
+    "Smart Tab Grouping – design document",
+    { groupId: "stg-group" }
   );
-  Assert.equal(titles[4], "Hotel Deals: Save Big on Hotels with Expedia");
+  const anchor1 = makeUrlTab(
+    "https://docs.google.com/document/d/1-smart-tab-grouping-logistic-regression/edit",
+    "Smart Tab Grouping – logistic regression model notes",
+    { groupId: "stg-group" }
+  );
+
+  const candGithub = makeUrlTab(
+    "https://github.com/mozilla-mobile/firefox-android/issues/999999",
+    "Smart Tab Grouping: tune logistic regression thresholds for mobile"
+  );
+  const candMdn = makeUrlTab(
+    "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map",
+    "Array.prototype.map() – JavaScript | MDN"
+  );
+  const candNba = makeUrlTab(
+    "https://www.espn.com/nba/scoreboard",
+    "NBA scoreboard – live scores and results"
+  );
+  const candRecipe = makeUrlTab(
+    "https://www.seriouseats.com/best-lasagna-recipe",
+    "The very best lasagna recipe"
+  );
+
+  const allTabs = [anchor0, anchor1, candGithub, candMdn, candNba, candRecipe];
+
+  const groupedIndices = [0, 1];
+  const alreadyGroupedIndices = [];
+  const groupLabel = "Smart Tab Grouping";
+
+  // Nearest neighbors
+  const nnTabs = await stgManager.findNearestNeighbors({
+    allTabs,
+    groupedIndices,
+    alreadyGroupedIndices,
+    groupLabel,
+    thresholdMills: 275,
+  });
+
+  Assert.greaterOrEqual(
+    nnTabs.length,
+    1,
+    "Nearest neighbors should return at least one candidate in the realistic example"
+  );
+
+  // run LR
+  const lrTabs = await stgManager.findSimilarTabsLogisticRegression({
+    allTabs,
+    groupedIndices,
+    alreadyGroupedIndices,
+    groupLabel,
+  });
+
+  Assert.greaterOrEqual(
+    lrTabs.length,
+    1,
+    "Logistic regression should return at least one candidate in the realistic example"
+  );
+
+  const lrTitles = lrTabs.map(t => t.label);
+  Assert.ok(
+    !lrTitles.includes("The very best lasagna recipe"),
+    "Logistic regression should not select a totally unrelated lasagna recipe tab"
+  );
+
   generateEmbeddingsStub.restore();
   await EngineProcess.destroyMLEngine();
   await cleanup();

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	browser/components/tabbrowser/SmartTabGrouping.sys.mjs	\|	213	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_calculate_probability.js	\|	27	++++++++++++++++++++++++---
A	browser/components/tabbrowser/test/xpcshell/smarttabgrouping/test_logistic_regression_utils.js	\|	225	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	browser/components/tabbrowser/test/xpcshell/smarttabgrouping/xpcshell.toml	\|	2	++
M	toolkit/components/ml/tests/browser/browser_ml_smart_tab_clustering_perf.js	\|	211	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---