[ tor-browser ].git.dasho

commit a07fd9772bbb931e789d55a0af554a7a8850d5b1
parent ccf612b0a4ae5727f7c50623a3e635db90e98c57
Author: Erik Nordin <enordin@mozilla.com>
Date:   Fri, 24 Oct 2025 19:56:59 +0000

Bug 1995634 - Use nsIDocumentEncoder for Translations langId r=translations-reviewers,gregtatum

This patch updates the Translations language-detection algorithm
to utilize nsIDocumentEncoder for the page-text extraction.

Differential Revision: https://phabricator.services.mozilla.com/D269460

Diffstat:
M toolkit/components/translations/actors/TranslationsChild.sys.mjs  | 21 +++++++++++++++++++++
M toolkit/components/translations/actors/TranslationsParent.sys.mjs  | 71 +++++++++++++++++++++++++++++++++++++++++------------------------------

2 files changed, 62 insertions(+), 30 deletions(-)
diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs
@@ -78,6 +78,27 @@ export class TranslationsChild extends JSWindowActorChild {
         this.#translatedDoc?.enterLazyTranslationsMode();
         return undefined;
       }
+      case "Translations:ExtractPageText": {
+        const { document } = this;
+        if (!document) {
+          return "";
+        }
+
+        const { sufficientLength } = data;
+
+        const encoder = Cu.createDocumentEncoder("text/plain");
+        encoder.init(
+          document,
+          "text/plain",
+          Ci.nsIDocumentEncoder.OutputBodyOnly |
+            Ci.nsIDocumentEncoder.SkipInvisibleContent |
+            Ci.nsIDocumentEncoder.AllowCrossShadowBoundary |
+            Ci.nsIDocumentEncoder.OutputDropInvisibleBreak |
+            Ci.nsIDocumentEncoder.OutputDisallowLineBreaking
+        );
+
+        return encoder.encodeToStringWithMaxLength(sufficientLength);
+      }
       case "Translations:TranslatePage": {
         if (this.#translatedDoc?.engineStatus === "error") {
           this.#translatedDoc.destroy();
diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs
@@ -3554,7 +3554,8 @@ export class TranslationsParent extends JSWindowActorParent {
   }
 
   /**
-   * Uses the page extractor to identify the current page's language.
+   * Extracts a substring of visible text from the content document and
+   * runs it through the language detector to determine the page's language.
    *
    * @returns {Promise<DetectionResult>}
    */
@@ -3563,48 +3564,58 @@ export class TranslationsParent extends JSWindowActorParent {
       return this.languageState.detectedLanguages.identified;
     }
 
-    const actor =
-      this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor");
-
-    if (!actor) {
-      throw new Error("Unable to get the PageExtractor actor.");
-    }
+    lazy.console.log(
+      "Beginning text extraction:",
+      this.browsingContext?.currentURI?.spec
+    );
 
-    const startTime = ChromeUtils.now();
+    const extractionStartTime = ChromeUtils.now();
+    const pageText = await this.sendQuery("Translations:ExtractPageText", {
+      sufficientLength: 4096,
+    });
 
-    // Manual profiling on 10 page loads of https://es.wikipedia.org/wiki/Felis_catus:
-    // -------------------------------------------------------------------------------
-    //
-    //   No limit: 2064 samples, 224/237/294 [min/med/max]ms (~85k code units)
-    // 8192 limit:  681 samples,  75/ 87/128 [min/med/max]ms
-    // 4096 limit:  457 samples,  51/ 55/ 97 [min/med/max]ms
-    // 2048 limit:  240 samples,  29/ 39/ 64 [min/med/max]ms
-    // 1024 limit:  142 samples,  19/ 28/ 58 [min/med/max]ms
-    //
-    // 2048 Code units feels like a decent length for performance and sample size.
-    const pageText = await actor.getText({ sufficientLength: 2048 });
     if (this.#isDestroyed) {
-      return { language: "", confident: false, languages: [] };
+      return { language: "en", confident: false, languages: [] };
     }
 
+    lazy.console.debug(
+      `Extracted Page Text (${pageText.length} code units):\n\n`,
+      pageText
+    );
+
+    const extractionLog =
+      `Extracted ${pageText.length} code units of text in ` +
+      `${(ChromeUtils.now() - extractionStartTime).toFixed(3)} ms.`;
+
+    lazy.console.log(extractionLog);
+    ChromeUtils.addProfilerMarker(
+      "TranslationsParent",
+      { startTime: extractionStartTime, innerWindowId: this.innerWindowId },
+      extractionLog
+    );
+
+    const identificationStartTime = ChromeUtils.now();
     const result = await lazy.LanguageDetector.detectLanguage(pageText);
+
     if (this.#isDestroyed) {
-      return { language: "", confident: false, languages: [] };
+      return { language: "en", confident: false, languages: [] };
     }
 
-    const message =
-      `Identified page language as "${result.language}" ` +
-      `in ${((ChromeUtils.now() - startTime) / 1000).toFixed(3)} seconds: ` +
-      this.browsingContext?.currentURI?.spec;
+    const identificationLog =
+      `Identified ${pageText.length} code units of text as "${result.language}" ` +
+      `in ${(ChromeUtils.now() - identificationStartTime).toFixed(3)} ms.`;
 
+    lazy.console.log(identificationLog);
     ChromeUtils.addProfilerMarker(
       "TranslationsParent",
-      { startTime, innerWindowId: this.innerWindowId },
-      message
+      { startTime: identificationStartTime, innerWindowId: this.innerWindowId },
+      identificationLog
+    );
+    ChromeUtils.addProfilerMarker(
+      "TranslationsParent",
+      { startTime: extractionStartTime, innerWindowId: this.innerWindowId },
+      "Total time to identify page language."
     );
-
-    lazy.console.debug("\nExtracted Page Text:\n\n", pageText);
-    lazy.console.log(message);
 
     if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) {
       result.confident = false;

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	toolkit/components/translations/actors/TranslationsChild.sys.mjs	\|	21	+++++++++++++++++++++
M	toolkit/components/translations/actors/TranslationsParent.sys.mjs	\|	71	+++++++++++++++++++++++++++++++++++++++++------------------------------