commit bcbd638c2a3b0f267f98c39e8ced0306a9ebf51f
parent 318eeb33b1fb1990bba29e8c3bcb182dc1980747
Author: Erik Nordin <enordin@mozilla.com>
Date: Wed, 8 Oct 2025 19:28:55 +0000
Bug 1967758 - Remove LanguageDetector.detectLanguageFromDocument r=translations-reviewers,gregtatum
This patch removes the `LanguageDetector.detectLanguageFromDocument` function,
since it should no longer be used. Developers should instead utilize the
PageExtractor capability to extract text from the DOM, and then run
`LanguageDetector.detectLanguage` on the extracted text.
Differential Revision: https://phabricator.services.mozilla.com/D267653
Diffstat:
1 file changed, 2 insertions(+), 74 deletions(-)
diff --git a/toolkit/components/translations/LanguageDetector.sys.mjs b/toolkit/components/translations/LanguageDetector.sys.mjs
@@ -11,54 +11,6 @@ const WORKER_URL = "resource://gre/modules/translations/cld-worker.js";
*/
/**
- * The options used for when detecting a language.
- *
- * @typedef {object} DetectionOptions
- *
- * @property {string} text - The text to analyze.
- * @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as
- * HTML rather than plain text.
- * @property {string} [language] - A string indicating the expected language. For text
- * extracted from HTTP documents, this is expected to come from the Content-Language
- * header.
- * @property {string} [tld] - A string indicating the top-level domain of the document the
- * text was extracted from.
- * @property {string} [encoding] - A string describing the encoding of the document the
- * string was extracted from. Note that, regardless of the value of this property,
- * the 'text' property must be a UTF-16 JavaScript string.
- */
-
-/**
- * The length of the substring to pull from the document's text for language
- * identification.
- *
- * This value should ideally be one that is large enough to yield a confident
- * identification result without being too large or expensive to extract.
- *
- * At this time, this value is not driven by statistical data or analysis.
- */
-const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
-
-/**
- * The shorter the text, the less confidence we should have in the result of the language
- * identification. Add another heuristic to report the ID as not confident if the length
- * of the code points of the text is less than this threshold.
- *
- * This was determined by plotting a kernel density estimation of the number of times the
- * source language had to be changed in the SelectTranslationsPanel vs. the code units in
- * the source text.
- *
- * 0013 code units or less - 49.5% of language changes
- * 0036 code units or less - 74.9% of language changes
- * 0153 code units or less - 90.0% of language changes
- * 0200 code units or less - 91.5% of language changes
- * 0427 code units or less - 95.0% of language changes
- * 1382 code units or less - 98.0% of language changes
- * 3506 code units or less - 99.0% of language changes
- */
-const DOC_CONFIDENCE_THRESHOLD = 200;
-
-/**
* An internal class to manage communicating to the worker, and managing its lifecycle.
* It's initialized once below statically to the module.
*/
@@ -184,7 +136,8 @@ class WorkerManager {
export const workerManager = new WorkerManager();
/**
- *
+ * This class provides the ability to identify the language of text using
+ * the CLD2 language-detection algorithm.
*/
export class LanguageDetector {
/**
@@ -201,29 +154,4 @@ export class LanguageDetector {
return workerManager.detectLanguage(options);
}
-
- /**
- * Attempts to determine the language in which the document's content is written.
- *
- * @param {Document} document
- * @returns {Promise<DetectionResult>}
- */
- static async detectLanguageFromDocument(document) {
- // Grab a selection of text.
- let encoder = Cu.createDocumentEncoder("text/plain");
- encoder.init(document, "text/plain", encoder.SkipInvisibleContent);
- let text = encoder
- .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
- .replaceAll("\r", "")
- .replaceAll("\n", " ");
-
- const result = await workerManager.detectLanguage({
- text,
- });
-
- if (text.length < DOC_CONFIDENCE_THRESHOLD) {
- result.confident = false;
- }
- return result;
- }
}