[ tor-browser ].git.dasho

commit d97c693d3e02bbb6765181a3a6f808e1d4267eb5
parent ad46d27392b643a581cd52d063fcb0bb43250791
Author: Tim Xia <txia@mozilla.com>
Date:   Fri, 10 Oct 2025 18:19:21 +0000

Bug 1984938 - Link previews do not obey text encoding for non-UTF-8 pages, r=Mardak,firefox-ai-ml-reviewers

- source from HTTP Content-Type header
- add test file encoded in shift-jis encoding

Differential Revision: https://phabricator.services.mozilla.com/D264152

Diffstat:
M browser/components/genai/LinkPreviewChild.sys.mjs  | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M browser/components/genai/tests/browser/browser.toml  | 3 ++-
M browser/components/genai/tests/browser/browser_link_preview.js  | 21 +++++++++++++++++++++
A browser/components/genai/tests/browser/data/encodingWithShiftJIS.html  | 10 ++++++++++

4 files changed, 142 insertions(+), 11 deletions(-)
diff --git a/browser/components/genai/LinkPreviewChild.sys.mjs b/browser/components/genai/LinkPreviewChild.sys.mjs
@@ -69,11 +69,11 @@ export class LinkPreviewChild extends JSWindowActorChild {
     const { promise, resolve, reject } = Promise.withResolvers();
     const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit
 
-    let charset = "utf-8";
+    let charset = null;
     const byteChunks = [];
     let totalLength = 0;
     channel.asyncOpen({
-      onDataAvailable(request, stream, offset, count) {
+      onDataAvailable: (request, stream, offset, count) => {
         totalLength += count;
         if (totalLength > MAX_CONTENT_LENGTH) {
           request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
@@ -81,7 +81,7 @@ export class LinkPreviewChild extends JSWindowActorChild {
           byteChunks.push(lazy.NetUtil.readInputStream(stream, count));
         }
       },
-      onStartRequest(request) {
+      onStartRequest: request => {
         const http = request.QueryInterface(Ci.nsIHttpChannel);
 
         // Enforce text/html if provided by server
@@ -106,7 +106,7 @@ export class LinkPreviewChild extends JSWindowActorChild {
           }
         } catch (ex) {}
       },
-      onStopRequest(_request, status) {
+      onStopRequest: (_request, status) => {
         if (Components.isSuccessCode(status)) {
           const bytes = new Uint8Array(totalLength);
           let offset = 0;
@@ -115,8 +115,16 @@ export class LinkPreviewChild extends JSWindowActorChild {
             offset += chunk.byteLength;
           }
 
-          const decoder = new TextDecoder(charset);
-          resolve(decoder.decode(bytes));
+          const effectiveCharset = this.sniffCharset(bytes, charset);
+          let decoded;
+          try {
+            // Use a non-fatal decode to be more robust to minor encoding errors.
+            decoded = new TextDecoder(effectiveCharset).decode(bytes);
+          } catch (e) {
+            // Fallback to UTF-8 if the decoder could not be constructed (e.g. unsupported label); non-fatal decoding itself does not throw on malformed bytes.
+            decoded = new TextDecoder("utf-8").decode(bytes);
+          }
+          resolve(decoded);
         } else {
           reject(Components.Exception("Failed to fetch HTML", status));
         }
@@ -126,6 +134,96 @@ export class LinkPreviewChild extends JSWindowActorChild {
   }
 
   /**
+   * Sniff an effective charset for the given response bytes; unlike the HTML standard, a meta declaration is tried before the HTTP header:
+   *   1) Byte Order Mark (BOM)
+   *   2) <meta charset> or http-equiv in the first 8KB of the document
+   *   3) HTTP Content-Type header charset (if provided and valid)
+   *   4) Default to utf-8
+   *
+   * @param {Uint8Array} bytes - The raw response bytes.
+   * @param {string|null} headerCharset - The charset from the Content-Type header, or null/empty if none was provided.
+   * @returns {string} A validated, effective charset label for TextDecoder.
+   */
+  sniffCharset(bytes, headerCharset = "") {
+    // 1. BOM detection (highest priority)
+    if (
+      bytes.length >= 3 &&
+      bytes[0] === 0xef &&
+      bytes[1] === 0xbb &&
+      bytes[2] === 0xbf
+    ) {
+      return "utf-8";
+    }
+    if (bytes.length >= 2) {
+      if (bytes[0] === 0xfe && bytes[1] === 0xff) {
+        return "utf-16be";
+      }
+      if (bytes[0] === 0xff && bytes[1] === 0xfe) {
+        return "utf-16le";
+      }
+    }
+
+    // 2. Scan the first 8KB for a meta-declared charset. This is checked before
+    // the HTTP header as a heuristic for misconfigured servers where the HTML
+    // is more likely to be correct.
+    try {
+      const headLen = Math.min(bytes.length, 8192);
+      const head = new TextDecoder("windows-1252").decode(
+        bytes.subarray(0, headLen)
+      );
+
+      const metaCharsetRegex = /<meta\s+charset\s*=\s*["']?([a-z0-9_-]+)/i;
+      let match = head.match(metaCharsetRegex);
+
+      if (!match) {
+        const httpEquivRegex =
+          /<meta\s+http-equiv\s*=\s*["']?content-type["']?[^>]*content\s*=\s*["'][^"']*charset\s*=\s*([a-z0-9_-]+)/i;
+        match = head.match(httpEquivRegex);
+      }
+
+      if (match && match[1]) {
+        const norm = this.normalizeAndValidateEncodingLabel(match[1]);
+        if (norm) {
+          return norm;
+        }
+      }
+    } catch (e) {
+      // Ignore errors during meta scan and fall through.
+    }
+
+    // 3. Use charset from HTTP header if it's valid.
+    if (headerCharset) {
+      const norm = this.normalizeAndValidateEncodingLabel(headerCharset);
+      if (norm) {
+        return norm;
+      }
+    }
+
+    // 4. Default to UTF-8 if no other charset is found.
+    return "utf-8";
+  }
+
+  /**
+   * Normalizes a charset label and validates it is supported by TextDecoder.
+   *
+   * @param {string} label - The raw encoding label from headers or meta tags.
+   * @returns {string|null} The normalized, validated label, or null if invalid.
+   */
+  normalizeAndValidateEncodingLabel(label) {
+    const l = (label || "").trim();
+    if (!l) {
+      return null;
+    }
+    try {
+      // TextDecoder constructor handles aliases and validation.
+      return new TextDecoder(l).encoding;
+    } catch (e) {
+      // The label was invalid or unsupported.
+    }
+    return null;
+  }
+
+  /**
    * Fetches HTML content from a URL and parses its meta tags and page text.
    *
    * @param {string} url - The URL to fetch and parse.
@@ -257,11 +355,12 @@ export class LinkPreviewChild extends JSWindowActorChild {
     ];
 
     metaTags.forEach(tag => {
-      const name = tag.getAttribute("name") || tag.getAttribute("property");
+      const rawName = tag.getAttribute("name") || tag.getAttribute("property");
       const content = tag.getAttribute("content");
-      if (name && content) {
-        if (desiredMetaNames.includes(name.toLowerCase())) {
-          metaInfo[name] = content;
+      const key = rawName ? rawName.toLowerCase() : null;
+      if (key && content) {
+        if (desiredMetaNames.includes(key)) {
+          metaInfo[key] = content;
         }
       }
     });
diff --git a/browser/components/genai/tests/browser/browser.toml b/browser/components/genai/tests/browser/browser.toml
@@ -35,7 +35,8 @@ skip-if = [
 ["browser_link_preview.js"]
 support-files = [
   "data/readableFr.html",
-  "data/readableEn.html"
+  "data/readableEn.html",
+  "data/encodingWithShiftJIS.html"
 ]
 
 ["browser_link_preview_nimbus.js"]
diff --git a/browser/components/genai/tests/browser/browser_link_preview.js b/browser/components/genai/tests/browser/browser_link_preview.js
@@ -419,6 +419,27 @@ add_task(async function test_fetch_page_data() {
 });
 
 /**
+ * Test that Shift-JIS encoding is handled correctly.
+ */
+add_task(async function test_fetch_shift_jis() {
+  await SpecialPowers.pushPrefEnv({
+    set: [["browser.ml.linkPreview.enabled", true]],
+  });
+  const actor =
+    window.browsingContext.currentWindowContext.getActor("LinkPreview");
+  const result = await actor.fetchPageData(
+    "https://example.com/browser/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html"
+  );
+
+  ok(!result.error, "should not have an error");
+  is(
+    result.rawMetaInfo["html:title"],
+    "Shift-JIS テスト",
+    "title should be correct"
+  );
+});
+
+/**
  * Test fetching errors.
  */
 add_task(async function test_fetch_errors() {
diff --git a/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html b/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Shift-JIS &#12486;&#12473;&#12488;</title>
+<meta charset="shift-jis">
+</head>
+<body>
+<p>&#12300;&#26085;&#26412;&#35486;&#12399;&#12289;&#12402;&#12425;&#12364;&#12394;&#12289;&#12459;&#12479;&#12459;&#12490;&#12289;&#28450;&#23383;&#12392;&#12356;&#12358;3&#31278;&#39006;&#12398;&#25991;&#23383;&#12434;&#20351;&#12358;&#35423;&#38627;&#12394;&#34920;&#35352;&#20307;&#31995;&#12434;&#25345;&#12387;&#12390;&#12356;&#12414;&#12377;&#12290;&#12402;&#12425;&#12364;&#12394;&#12399;&#26085;&#26412;&#35486;&#12398;&#22266;&#26377;&#12398;&#35328;&#33865;&#12420;&#25991;&#27861;&#30340;&#12394;&#35201;&#32032;&#12395;&#20351;&#12431;&#12428;&#12289;&#12459;&#12479;&#12459;&#12490;&#12399;&#22806;&#26469;&#35486;&#12395;&#20351;&#12431;&#12428;&#12414;&#12377;&#12290;&#28450;&#23383;&#12399;&#26085;&#26412;&#35486;&#12395;&#21462;&#12426;&#20837;&#12428;&#12425;&#12428;&#12383;&#20013;&#22269;&#12398;&#25991;&#23383;&#12391;&#12377;&#12290;&#12301;</p>
+</body>
+</html>

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	browser/components/genai/LinkPreviewChild.sys.mjs	\|	119	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M	browser/components/genai/tests/browser/browser.toml	\|	3	++-
M	browser/components/genai/tests/browser/browser_link_preview.js	\|	21	+++++++++++++++++++++
A	browser/components/genai/tests/browser/data/encodingWithShiftJIS.html	\|	10	++++++++++