[ tor-browser ].git.dasho

commit a72f8f31ab53f952c0ae63a4df0dd1ecc88f9b2a
parent 198fa84fad7afc56fa617408cc5bb385e42eb792
Author: Erik Nordin <enordin@mozilla.com>
Date:   Sat, 18 Oct 2025 02:16:50 +0000

Bug 1967758 - Remove LanguageDetector.detectLanguageFromDocument r=translations-reviewers,gregtatum

This patch removes the `LanguageDetector.detectLanguageFromDocument` function,
since it should no longer be used. Developers should instead utilize the
PageExtractor capability to extract text from the DOM, and then run
`LanguageDetector.detectLanguage` on the extracted text.

Differential Revision: https://phabricator.services.mozilla.com/D267653

Diffstat:
M toolkit/components/translations/LanguageDetector.sys.mjs  | 76 ++--------------------------------------------------------------------------

1 file changed, 2 insertions(+), 74 deletions(-)
diff --git a/toolkit/components/translations/LanguageDetector.sys.mjs b/toolkit/components/translations/LanguageDetector.sys.mjs
@@ -11,54 +11,6 @@ const WORKER_URL = "resource://gre/modules/translations/cld-worker.js";
  */
 
 /**
- * The options used for when detecting a language.
- *
- * @typedef {object} DetectionOptions
- *
- * @property {string} text - The text to analyze.
- * @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as
- *     HTML rather than plain text.
- * @property {string} [language] - A string indicating the expected language. For text
- *     extracted from HTTP documents, this is expected to come from the Content-Language
- *     header.
- * @property {string} [tld] - A string indicating the top-level domain of the document the
- *     text was extracted from.
- * @property {string} [encoding] - A string describing the encoding of the document the
- *     string was extracted from. Note that, regardless of the value of this property,
- *     the 'text' property must be a UTF-16 JavaScript string.
- */
-
-/**
- * The length of the substring to pull from the document's text for language
- * identification.
- *
- * This value should ideally be one that is large enough to yield a confident
- * identification result without being too large or expensive to extract.
- *
- * At this time, this value is not driven by statistical data or analysis.
- */
-const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
-
-/**
- * The shorter the text, the less confidence we should have in the result of the language
- * identification. Add another heuristic to report the ID as not confident if the length
- * of the code points of the text is less than this threshold.
- *
- * This was determined by plotting a kernel density estimation of the number of times the
- * source language had to be changed in the SelectTranslationsPanel vs. the code units in
- * the source text.
- *
- * 0013 code units or less - 49.5% of language changes
- * 0036 code units or less - 74.9% of language changes
- * 0153 code units or less - 90.0% of language changes
- * 0200 code units or less - 91.5% of language changes
- * 0427 code units or less - 95.0% of language changes
- * 1382 code units or less - 98.0% of language changes
- * 3506 code units or less - 99.0% of language changes
- */
-const DOC_CONFIDENCE_THRESHOLD = 200;
-
-/**
  * An internal class to manage communicating to the worker, and managing its lifecycle.
  * It's initialized once below statically to the module.
  */
@@ -184,7 +136,8 @@ class WorkerManager {
 export const workerManager = new WorkerManager();
 
 /**
- *
+ * This class provides the ability to identify the language of text using
+ * the CLD2 language-detection algorithm.
  */
 export class LanguageDetector {
   /**
@@ -201,29 +154,4 @@ export class LanguageDetector {
 
     return workerManager.detectLanguage(options);
   }
-
-  /**
-   * Attempts to determine the language in which the document's content is written.
-   *
-   * @param {Document} document
-   * @returns {Promise<DetectionResult>}
-   */
-  static async detectLanguageFromDocument(document) {
-    // Grab a selection of text.
-    let encoder = Cu.createDocumentEncoder("text/plain");
-    encoder.init(document, "text/plain", encoder.SkipInvisibleContent);
-    let text = encoder
-      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
-      .replaceAll("\r", "")
-      .replaceAll("\n", " ");
-
-    const result = await workerManager.detectLanguage({
-      text,
-    });
-
-    if (text.length < DOC_CONFIDENCE_THRESHOLD) {
-      result.confident = false;
-    }
-    return result;
-  }
 }

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE