commit 2db2ad618c5d8148357fdd8d720f8c2225dfdc79
parent 77d5e074eecb55cdb3d568a284f65913add22ee2
Author: Cristina Horotan <chorotan@mozilla.com>
Date: Thu, 9 Oct 2025 01:19:36 +0300
Revert "Bug 1967758 - Utilize PageExtractor for Translations language detection r=translations-reviewers,gregtatum" on request
This reverts commit 421732532b12b44076f9e7b894c7d83e728c0f74.
Revert "Bug 1967758 - Remove unused identifiedLangTag entry r=translations-reviewers,gregtatum"
This reverts commit bb6366e8e23f342a0750bb300f97feeecb8be2cc.
Revert "Bug 1967758 - Remove LanguageDetector.detectLanguageFromDocument r=translations-reviewers,gregtatum"
This reverts commit bcbd638c2a3b0f267f98c39e8ced0306a9ebf51f.
Diffstat:
4 files changed, 116 insertions(+), 120 deletions(-)
diff --git a/browser/components/translations/tests/browser/browser_translations_full_page_language_id_behavior.js b/browser/components/translations/tests/browser/browser_translations_full_page_language_id_behavior.js
@@ -296,57 +296,3 @@ add_task(async function test_language_identification_behavior() {
await cleanup();
}
});
-
-/**
- * This test case tests the behavior when the page has no declared language
- * tag and the detected language is not supported by Translations.
- */
-add_task(async function test_detected_language_unsupported() {
- info("Testing unsupported detected language with no declared language");
- TranslationsParent.testAutomaticPopup = true;
-
- let wasPopupShown = false;
- window.FullPageTranslationsPanel.elements; // De-lazify the panel.
-
- const { resolve } = Promise.withResolvers();
- const panel = window.document.getElementById("full-page-translations-panel");
-
- function handlePopupShown() {
- wasPopupShown = true;
- panel.removeEventListener("popupshown", handlePopupShown);
- resolve();
- }
- panel.addEventListener("popupshown", handlePopupShown);
-
- const { cleanup, runInPage } = await loadTestPage({
- page: SPANISH_PAGE_UNDECLARED_URL,
- // Deliberately omit Spanish so that it is not supported.
- languagePairs: [
- { fromLang: "en", toLang: "fr" },
- { fromLang: "fr", toLang: "en" },
- { fromLang: "en", toLang: "uk" },
- { fromLang: "uk", toLang: "en" },
- ],
- autoDownloadFromRemoteSettings: true,
- contentEagerMode: true,
- });
-
- await FullPageTranslationsTestUtils.assertTranslationsButton(
- { button: false },
- "The translations button is not visible when the detected language is unsupported."
- );
-
- await FullPageTranslationsTestUtils.assertPageIsNotTranslated(
- runInPage,
- "No translation should occur when the detected language is unsupported."
- );
-
- is(
- wasPopupShown,
- false,
- "A translation was not offered for an unsupported detected language."
- );
-
- TranslationsParent.testAutomaticPopup = false;
- await cleanup();
-});
diff --git a/toolkit/components/translations/LanguageDetector.sys.mjs b/toolkit/components/translations/LanguageDetector.sys.mjs
@@ -11,6 +11,54 @@ const WORKER_URL = "resource://gre/modules/translations/cld-worker.js";
*/
/**
+ * The options used when detecting a language.
+ *
+ * @typedef {object} DetectionOptions
+ *
+ * @property {string} text - The text to analyze.
+ * @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as
+ * HTML rather than plain text.
+ * @property {string} [language] - A string indicating the expected language. For text
+ * extracted from HTTP documents, this is expected to come from the Content-Language
+ * header.
+ * @property {string} [tld] - A string indicating the top-level domain of the document the
+ * text was extracted from.
+ * @property {string} [encoding] - A string describing the encoding of the document the
+ * string was extracted from. Note that, regardless of the value of this property,
+ * the 'text' property must be a UTF-16 JavaScript string.
+ */
+
+/**
+ * The length of the substring to pull from the document's text for language
+ * identification.
+ *
+ * This value should ideally be one that is large enough to yield a confident
+ * identification result without being too large or expensive to extract.
+ *
+ * At this time, this value is not driven by statistical data or analysis.
+ */
+const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
+
+/**
+ * The shorter the text, the less confidence we should have in the result of the language
+ * identification. As an additional heuristic, report the identification as not confident
+ * if the length of the text in UTF-16 code units is less than this threshold.
+ *
+ * This was determined by plotting a kernel density estimation of the number of times the
+ * source language had to be changed in the SelectTranslationsPanel vs. the code units in
+ * the source text.
+ *
+ * 0013 code units or less - 49.5% of language changes
+ * 0036 code units or less - 74.9% of language changes
+ * 0153 code units or less - 90.0% of language changes
+ * 0200 code units or less - 91.5% of language changes
+ * 0427 code units or less - 95.0% of language changes
+ * 1382 code units or less - 98.0% of language changes
+ * 3506 code units or less - 99.0% of language changes
+ */
+const DOC_CONFIDENCE_THRESHOLD = 200;
+
+/**
* An internal class to manage communicating to the worker, and managing its lifecycle.
* It's initialized once below statically to the module.
*/
@@ -136,8 +184,7 @@ class WorkerManager {
export const workerManager = new WorkerManager();
/**
- * This class provides the ability to identify the language of text using
- * the CLD2 language-detection algorithm.
+ * Identifies the language of text using the CLD2 language-detection algorithm.
*/
export class LanguageDetector {
/**
@@ -154,4 +201,29 @@ export class LanguageDetector {
return workerManager.detectLanguage(options);
}
+
+ /**
+ * Attempts to determine the language in which the document's content is written.
+ * Marks the result as not confident if the sample is under DOC_CONFIDENCE_THRESHOLD.
+ * @param {Document} document
+ * @returns {Promise<DetectionResult>}
+ */
+ static async detectLanguageFromDocument(document) {
+ // Encode the document as plain text, capped at DOC_TEXT_TO_IDENTIFY_LENGTH, with line breaks flattened.
+ let encoder = Cu.createDocumentEncoder("text/plain");
+ encoder.init(document, "text/plain", encoder.SkipInvisibleContent);
+ let text = encoder
+ .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
+ .replaceAll("\r", "")
+ .replaceAll("\n", " ");
+
+ const result = await workerManager.detectLanguage({
+ text,
+ });
+
+ if (text.length < DOC_CONFIDENCE_THRESHOLD) {
+ result.confident = false;
+ }
+ return result;
+ }
}
diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs
@@ -8,6 +8,8 @@ ChromeUtils.defineESModuleGetters(lazy, {
"chrome://global/content/translations/translations-document.sys.mjs",
LRUCache:
"chrome://global/content/translations/translations-document.sys.mjs",
+ LanguageDetector:
+ "resource://gre/modules/translations/LanguageDetector.sys.mjs",
});
/**
@@ -115,6 +117,33 @@ export class TranslationsChild extends JSWindowActorChild {
case "Translations:GetDocumentElementLang": {
return this.document.documentElement.lang;
}
+ case "Translations:IdentifyLanguage": {
+ // Wait for idle callback as the page will be more settled if it has
+ // dynamic content, like on a React app.
+ if (this.contentWindow) {
+ await new Promise(resolve => {
+ this.contentWindow.requestIdleCallback(resolve);
+ });
+ }
+
+ if (this.#isDestroyed) {
+ return undefined;
+ }
+
+ const startTime = ChromeUtils.now();
+ const detectionResult =
+ await lazy.LanguageDetector.detectLanguageFromDocument(this.document);
+
+ if (this.#isDestroyed) {
+ return undefined;
+ }
+
+ this.addProfilerMarker(
+ `Detect language from document: ${detectionResult.language}`,
+ startTime
+ );
+ return detectionResult;
+ }
case "Translations:AcquirePort": {
this.addProfilerMarker("Acquired a port, resuming translations");
this.#translatedDoc.acquirePort(data.port);
diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs
@@ -81,8 +81,6 @@ XPCOMUtils.defineLazyServiceGetters(lazy, {
});
ChromeUtils.defineESModuleGetters(lazy, {
- LanguageDetector:
- "resource://gre/modules/translations/LanguageDetector.sys.mjs",
RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
setTimeout: "resource://gre/modules/Timer.sys.mjs",
TranslationsTelemetry:
@@ -447,27 +445,6 @@ export class TranslationsParent extends JSWindowActorParent {
static LANGUAGE_MODEL_MAJOR_VERSION_MAX = 3;
/**
- * The shorter the text, the less confidence we should have in the result of the language
- * identification. Add another heuristic to report the ID as not confident if the length
- * of the code points of the text is less than this threshold.
- *
- * This was determined by plotting a kernel density estimation of the number of times the
- * source language had to be changed in the SelectTranslationsPanel vs. the code units in
- * the source text.
- *
- * 0013 code units or less - 49.5% of language changes
- * 0036 code units or less - 74.9% of language changes
- * 0153 code units or less - 90.0% of language changes
- * 0200 code units or less - 91.5% of language changes
- * 0427 code units or less - 95.0% of language changes
- * 1382 code units or less - 98.0% of language changes
- * 3506 code units or less - 99.0% of language changes
- *
- * @type {number}
- */
- static #DOC_CONFIDENCE_THRESHOLD = 150;
-
- /**
* Contains the state that would affect UI. Anytime this state is changed, a dispatch
* event is sent so that UI can react to it. The actor is inside of /toolkit and
* needs a way of notifying /browser code (or other users) of when the state changes.
@@ -818,7 +795,7 @@ export class TranslationsParent extends JSWindowActorParent {
// popup will not be shown.
if (detectedLanguages.htmlLangAttribute && !detectedLanguages.identified) {
// Compare language langTagsMatch
- detectedLanguages.identified = await this.#identifyPageLanguage();
+ detectedLanguages.identified = await this.queryIdentifyLanguage();
if (
!lazy.TranslationsUtils.langTagsMatch(
@@ -867,7 +844,7 @@ export class TranslationsParent extends JSWindowActorParent {
if (
!TranslationsParent.findCompatibleSourceLangTagSync(
- detectedLanguages.identified.language,
+ detectedLanguages.identifiedLangTag,
await TranslationsParent.getNonPivotLanguagePairs()
)
) {
@@ -3493,11 +3470,9 @@ export class TranslationsParent extends JSWindowActorParent {
}
/**
- * Uses the page extractor to identify the current page's language.
- *
* @returns {Promise<DetectionResult>}
*/
- async #identifyPageLanguage() {
+ async queryIdentifyLanguage() {
if (
TranslationsParent.isInAutomation() &&
!TranslationsParent.#isTranslationsEngineMocked
@@ -3505,39 +3480,13 @@ export class TranslationsParent extends JSWindowActorParent {
// In automation assume English is the language, but don't be confident.
return { confident: false, language: "en", languages: [] };
}
-
- const actor =
- this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor");
-
- if (!actor) {
- throw new Error("Unable to get the PageExtractor actor.");
- }
-
- const startTime = ChromeUtils.now();
-
- const pageText = await actor.getText();
- if (this.#isDestroyed) {
- return { language: "", confident: false, languages: [] };
- }
-
- const result = await lazy.LanguageDetector.detectLanguage(pageText);
- if (this.#isDestroyed) {
- return { language: "", confident: false, languages: [] };
- }
-
- const message = `Identified page language as "${result.language}": ${this.browsingContext?.currentURI?.spec}`;
- ChromeUtils.addProfilerMarker(
- "TranslationsParent",
- { startTime, innerWindowId: this.innerWindowId },
- message
- );
- lazy.console.debug(message);
-
- if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) {
- result.confident = false;
- }
-
- return result;
+ return this.sendQuery("Translations:IdentifyLanguage").catch(error => {
+ if (this.#isDestroyed) {
+ // The actor was destroyed while this message was still being resolved.
+ return null;
+ }
+ return Promise.reject(error);
+ });
}
/**
@@ -3580,7 +3529,7 @@ export class TranslationsParent extends JSWindowActorParent {
// Do a final check that the identified language matches the reported language
// tag to ensure that the page isn't reporting the incorrect languages. This
// check is deferred to now for performance considerations.
- langTags.identified = await this.#identifyPageLanguage();
+ langTags.identified = await this.queryIdentifyLanguage();
langTags.docLangTag = langTags.identified.language;
if (
@@ -3763,7 +3712,7 @@ export class TranslationsParent extends JSWindowActorParent {
userLangTag: null,
isDocLangTagSupported: false,
htmlLangAttribute: htmlLangAttribute ?? null,
- identified: null,
+ identifiedLangTag: null,
};
/**
@@ -3814,7 +3763,7 @@ export class TranslationsParent extends JSWindowActorParent {
if (!langTags.docLangTag) {
// If the document's markup had no specified langTag, attempt to identify the
// page's language.
- langTags.identified = await this.#identifyPageLanguage();
+ langTags.identified = await this.queryIdentifyLanguage();
langTags.docLangTag = langTags.identified.language;
maybeNormalizeDocLangTag();
langTags.identified.language = langTags.docLangTag;