commit a07fd9772bbb931e789d55a0af554a7a8850d5b1
parent ccf612b0a4ae5727f7c50623a3e635db90e98c57
Author: Erik Nordin <enordin@mozilla.com>
Date: Fri, 24 Oct 2025 19:56:59 +0000
Bug 1995634 - Use nsIDocumentEncoder for Translations langId r=translations-reviewers,gregtatum
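This patch updates the Translations language-detection algorithm
to utilize nsIDocumentEncoder for the page-text extraction, instead of
querying the PageExtractor actor.
TranslationsParent now sends a new "Translations:ExtractPageText" query,
which TranslationsChild answers by encoding the document body as plain
text up to the requested maximum length.
The requested sample size increases from 2048 to 4096 code units.
Extraction and identification are now logged and profiled as separate
steps, with times reported in milliseconds, plus a marker for the total.
The result returned when the actor is destroyed during detection now
reports "en" rather than an empty language string.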
Differential Revision: https://phabricator.services.mozilla.com/D269460
Diffstat:
2 files changed, 62 insertions(+), 30 deletions(-)
diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs
@@ -78,6 +78,27 @@ export class TranslationsChild extends JSWindowActorChild {
this.#translatedDoc?.enterLazyTranslationsMode();
return undefined;
}
+ case "Translations:ExtractPageText": {
+ const { document } = this;
+ if (!document) {
+ return "";
+ }
+
+ const { sufficientLength } = data;
+
+ const encoder = Cu.createDocumentEncoder("text/plain");
+ encoder.init(
+ document,
+ "text/plain",
+ Ci.nsIDocumentEncoder.OutputBodyOnly |
+ Ci.nsIDocumentEncoder.SkipInvisibleContent |
+ Ci.nsIDocumentEncoder.AllowCrossShadowBoundary |
+ Ci.nsIDocumentEncoder.OutputDropInvisibleBreak |
+ Ci.nsIDocumentEncoder.OutputDisallowLineBreaking
+ );
+
+ return encoder.encodeToStringWithMaxLength(sufficientLength);
+ }
case "Translations:TranslatePage": {
if (this.#translatedDoc?.engineStatus === "error") {
this.#translatedDoc.destroy();
diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs
@@ -3554,7 +3554,8 @@ export class TranslationsParent extends JSWindowActorParent {
}
/**
- * Uses the page extractor to identify the current page's language.
+ * Extracts a substring of visible text from the content document and
+ * runs it through the language detector to determine the page's language.
*
* @returns {Promise<DetectionResult>}
*/
@@ -3563,48 +3564,58 @@ export class TranslationsParent extends JSWindowActorParent {
return this.languageState.detectedLanguages.identified;
}
- const actor =
- this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor");
-
- if (!actor) {
- throw new Error("Unable to get the PageExtractor actor.");
- }
+ lazy.console.log(
+ "Beginning text extraction:",
+ this.browsingContext?.currentURI?.spec
+ );
- const startTime = ChromeUtils.now();
+ const extractionStartTime = ChromeUtils.now();
+ const pageText = await this.sendQuery("Translations:ExtractPageText", {
+ sufficientLength: 4096,
+ });
- // Manual profiling on 10 page loads of https://es.wikipedia.org/wiki/Felis_catus:
- // -------------------------------------------------------------------------------
- //
- // No limit: 2064 samples, 224/237/294 [min/med/max]ms (~85k code units)
- // 8192 limit: 681 samples, 75/ 87/128 [min/med/max]ms
- // 4096 limit: 457 samples, 51/ 55/ 97 [min/med/max]ms
- // 2048 limit: 240 samples, 29/ 39/ 64 [min/med/max]ms
- // 1024 limit: 142 samples, 19/ 28/ 58 [min/med/max]ms
- //
- // 2048 Code units feels like a decent length for performance and sample size.
- const pageText = await actor.getText({ sufficientLength: 2048 });
if (this.#isDestroyed) {
- return { language: "", confident: false, languages: [] };
+ return { language: "en", confident: false, languages: [] };
}
+ lazy.console.debug(
+ `Extracted Page Text (${pageText.length} code units):\n\n`,
+ pageText
+ );
+
+ const extractionLog =
+ `Extracted ${pageText.length} code units of text in ` +
+ `${(ChromeUtils.now() - extractionStartTime).toFixed(3)} ms.`;
+
+ lazy.console.log(extractionLog);
+ ChromeUtils.addProfilerMarker(
+ "TranslationsParent",
+ { startTime: extractionStartTime, innerWindowId: this.innerWindowId },
+ extractionLog
+ );
+
+ const identificationStartTime = ChromeUtils.now();
const result = await lazy.LanguageDetector.detectLanguage(pageText);
+
if (this.#isDestroyed) {
- return { language: "", confident: false, languages: [] };
+ return { language: "en", confident: false, languages: [] };
}
- const message =
- `Identified page language as "${result.language}" ` +
- `in ${((ChromeUtils.now() - startTime) / 1000).toFixed(3)} seconds: ` +
- this.browsingContext?.currentURI?.spec;
+ const identificationLog =
+ `Identified ${pageText.length} code units of text as "${result.language}" ` +
+ `in ${(ChromeUtils.now() - identificationStartTime).toFixed(3)} ms.`;
+ lazy.console.log(identificationLog);
ChromeUtils.addProfilerMarker(
"TranslationsParent",
- { startTime, innerWindowId: this.innerWindowId },
- message
+ { startTime: identificationStartTime, innerWindowId: this.innerWindowId },
+ identificationLog
+ );
+ ChromeUtils.addProfilerMarker(
+ "TranslationsParent",
+ { startTime: extractionStartTime, innerWindowId: this.innerWindowId },
+ "Total time to identify page language."
);
-
- lazy.console.debug("\nExtracted Page Text:\n\n", pageText);
- lazy.console.log(message);
if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) {
result.confident = false;