commit 421732532b12b44076f9e7b894c7d83e728c0f74
parent bb6366e8e23f342a0750bb300f97feeecb8be2cc
Author: Erik Nordin <enordin@mozilla.com>
Date: Wed, 8 Oct 2025 19:28:55 +0000
Bug 1967758 - Utilize PageExtractor for Translations language detection r=translations-reviewers,gregtatum
This patch updates the TranslationsParent language-identification logic
for Full-Page Translations to utilize the new PageExtractor functionality,
which should improve the quality of our language detection compared to
the previous implementation.
Differential Revision: https://phabricator.services.mozilla.com/D267655
Diffstat:
2 files changed, 62 insertions(+), 40 deletions(-)
diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs
@@ -8,8 +8,6 @@ ChromeUtils.defineESModuleGetters(lazy, {
"chrome://global/content/translations/translations-document.sys.mjs",
LRUCache:
"chrome://global/content/translations/translations-document.sys.mjs",
- LanguageDetector:
- "resource://gre/modules/translations/LanguageDetector.sys.mjs",
});
/**
@@ -117,33 +115,6 @@ export class TranslationsChild extends JSWindowActorChild {
case "Translations:GetDocumentElementLang": {
return this.document.documentElement.lang;
}
- case "Translations:IdentifyLanguage": {
- // Wait for idle callback as the page will be more settled if it has
- // dynamic content, like on a React app.
- if (this.contentWindow) {
- await new Promise(resolve => {
- this.contentWindow.requestIdleCallback(resolve);
- });
- }
-
- if (this.#isDestroyed) {
- return undefined;
- }
-
- const startTime = ChromeUtils.now();
- const detectionResult =
- await lazy.LanguageDetector.detectLanguageFromDocument(this.document);
-
- if (this.#isDestroyed) {
- return undefined;
- }
-
- this.addProfilerMarker(
- `Detect language from document: ${detectionResult.language}`,
- startTime
- );
- return detectionResult;
- }
case "Translations:AcquirePort": {
this.addProfilerMarker("Acquired a port, resuming translations");
this.#translatedDoc.acquirePort(data.port);
diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs
@@ -81,6 +81,8 @@ XPCOMUtils.defineLazyServiceGetters(lazy, {
});
ChromeUtils.defineESModuleGetters(lazy, {
+ LanguageDetector:
+ "resource://gre/modules/translations/LanguageDetector.sys.mjs",
RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
setTimeout: "resource://gre/modules/Timer.sys.mjs",
TranslationsTelemetry:
@@ -445,6 +447,27 @@ export class TranslationsParent extends JSWindowActorParent {
static LANGUAGE_MODEL_MAJOR_VERSION_MAX = 3;
/**
+ * The shorter the text, the less confidence we should have in the result of the language
+ * identification. As an additional heuristic, report the identification as not confident
+ * if the length of the text in UTF-16 code units is less than this threshold.
+ *
+ * This was determined by plotting a kernel density estimation of the number of times the
+ * source language had to be changed in the SelectTranslationsPanel vs. the code units in
+ * the source text.
+ *
+ * 0013 code units or less - 49.5% of language changes
+ * 0036 code units or less - 74.9% of language changes
+ * 0153 code units or less - 90.0% of language changes
+ * 0200 code units or less - 91.5% of language changes
+ * 0427 code units or less - 95.0% of language changes
+ * 1382 code units or less - 98.0% of language changes
+ * 3506 code units or less - 99.0% of language changes
+ *
+ * @type {number}
+ */
+ static #DOC_CONFIDENCE_THRESHOLD = 150;
+
+ /**
* Contains the state that would affect UI. Anytime this state is changed, a dispatch
* event is sent so that UI can react to it. The actor is inside of /toolkit and
* needs a way of notifying /browser code (or other users) of when the state changes.
@@ -795,7 +818,7 @@ export class TranslationsParent extends JSWindowActorParent {
// popup will not be shown.
if (detectedLanguages.htmlLangAttribute && !detectedLanguages.identified) {
// Compare language langTagsMatch
- detectedLanguages.identified = await this.queryIdentifyLanguage();
+ detectedLanguages.identified = await this.#identifyPageLanguage();
if (
!lazy.TranslationsUtils.langTagsMatch(
@@ -3470,9 +3493,11 @@ export class TranslationsParent extends JSWindowActorParent {
}
/**
+ * Uses the page extractor to identify the current page's language.
+ *
* @returns {Promise<DetectionResult>}
*/
- async queryIdentifyLanguage() {
+ async #identifyPageLanguage() {
if (
TranslationsParent.isInAutomation() &&
!TranslationsParent.#isTranslationsEngineMocked
@@ -3480,13 +3505,39 @@ export class TranslationsParent extends JSWindowActorParent {
// In automation assume English is the language, but don't be confident.
return { confident: false, language: "en", languages: [] };
}
- return this.sendQuery("Translations:IdentifyLanguage").catch(error => {
- if (this.#isDestroyed) {
- // The actor was destroyed while this message was still being resolved.
- return null;
- }
- return Promise.reject(error);
- });
+
+ const actor =
+ this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor");
+
+ if (!actor) {
+ throw new Error("Unable to get the PageExtractor actor.");
+ }
+
+ const startTime = ChromeUtils.now();
+
+ const pageText = await actor.getText();
+ if (this.#isDestroyed) {
+ return { language: "", confident: false, languages: [] };
+ }
+
+ const result = await lazy.LanguageDetector.detectLanguage(pageText);
+ if (this.#isDestroyed) {
+ return { language: "", confident: false, languages: [] };
+ }
+
+ const message = `Identified page language as "${result.language}": ${this.browsingContext?.currentURI?.spec}`;
+ ChromeUtils.addProfilerMarker(
+ "TranslationsParent",
+ { startTime, innerWindowId: this.innerWindowId },
+ message
+ );
+ lazy.console.debug(message);
+
+ if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) {
+ result.confident = false;
+ }
+
+ return result;
}
/**
@@ -3529,7 +3580,7 @@ export class TranslationsParent extends JSWindowActorParent {
// Do a final check that the identified language matches the reported language
// tag to ensure that the page isn't reporting the incorrect languages. This
// check is deferred to now for performance considerations.
- langTags.identified = await this.queryIdentifyLanguage();
+ langTags.identified = await this.#identifyPageLanguage();
langTags.docLangTag = langTags.identified.language;
if (
@@ -3763,7 +3814,7 @@ export class TranslationsParent extends JSWindowActorParent {
if (!langTags.docLangTag) {
// If the document's markup had no specified langTag, attempt to identify the
// page's language.
- langTags.identified = await this.queryIdentifyLanguage();
+ langTags.identified = await this.#identifyPageLanguage();
langTags.docLangTag = langTags.identified.language;
maybeNormalizeDocLangTag();
langTags.identified.language = langTags.docLangTag;