commit d97c693d3e02bbb6765181a3a6f808e1d4267eb5
parent ad46d27392b643a581cd52d063fcb0bb43250791
Author: Tim Xia <txia@mozilla.com>
Date: Fri, 10 Oct 2025 18:19:21 +0000
Bug 1984938 - Link previews do not obey text encoding for non-UTF-8 pages, r=Mardak,firefox-ai-ml-reviewers
- source from HTTP Content-Type header
- add test file encoded in shift-jis encoding
Differential Revision: https://phabricator.services.mozilla.com/D264152
Diffstat:
4 files changed, 142 insertions(+), 11 deletions(-)
diff --git a/browser/components/genai/LinkPreviewChild.sys.mjs b/browser/components/genai/LinkPreviewChild.sys.mjs
@@ -69,11 +69,11 @@ export class LinkPreviewChild extends JSWindowActorChild {
const { promise, resolve, reject } = Promise.withResolvers();
const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit
- let charset = "utf-8";
+ let charset = null;
const byteChunks = [];
let totalLength = 0;
channel.asyncOpen({
- onDataAvailable(request, stream, offset, count) {
+ onDataAvailable: (request, stream, offset, count) => {
totalLength += count;
if (totalLength > MAX_CONTENT_LENGTH) {
request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
@@ -81,7 +81,7 @@ export class LinkPreviewChild extends JSWindowActorChild {
byteChunks.push(lazy.NetUtil.readInputStream(stream, count));
}
},
- onStartRequest(request) {
+ onStartRequest: request => {
const http = request.QueryInterface(Ci.nsIHttpChannel);
// Enforce text/html if provided by server
@@ -106,7 +106,7 @@ export class LinkPreviewChild extends JSWindowActorChild {
}
} catch (ex) {}
},
- onStopRequest(_request, status) {
+ onStopRequest: (_request, status) => {
if (Components.isSuccessCode(status)) {
const bytes = new Uint8Array(totalLength);
let offset = 0;
@@ -115,8 +115,16 @@ export class LinkPreviewChild extends JSWindowActorChild {
offset += chunk.byteLength;
}
- const decoder = new TextDecoder(charset);
- resolve(decoder.decode(bytes));
+ const effectiveCharset = this.sniffCharset(bytes, charset);
+ let decoded;
+ try {
+ // Use a non-fatal decode to be more robust to minor encoding errors.
+ decoded = new TextDecoder(effectiveCharset).decode(bytes);
+ } catch (e) {
+ // Fallback to UTF-8 if constructing the decoder throws (e.g. an unsupported label); non-fatal decoding itself does not throw on malformed bytes.
+ decoded = new TextDecoder("utf-8").decode(bytes);
+ }
+ resolve(decoded);
} else {
reject(Components.Exception("Failed to fetch HTML", status));
}
@@ -126,6 +134,96 @@ export class LinkPreviewChild extends JSWindowActorChild {
}
/**
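+ * Sniff an effective charset for the given response bytes. The order below is modeled on the HTML standard's encoding sniffing, except that an in-document <meta> declaration is deliberately preferred over the HTTP header (a heuristic for misconfigured servers):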
+ * 1) Byte Order Mark (BOM)
+ * 2) <meta charset> or http-equiv in the first 8KB of the document
+ * 3) HTTP Content-Type header charset (if provided and valid)
+ * 4) Default to utf-8
+ *
+ * @param {Uint8Array} bytes - The raw response bytes.
+ * @param {string|null} [headerCharset] - The charset from the Content-Type header; may be null or empty when none is known.
+ * @returns {string} A validated, effective charset label for TextDecoder.
+ */
+ sniffCharset(bytes, headerCharset = "") {
+   // 1. BOM detection (highest priority): a byte order mark identifies the
+   // encoding unambiguously, so it wins over any declared label.
+   if (
+     bytes.length >= 3 &&
+     bytes[0] === 0xef &&
+     bytes[1] === 0xbb &&
+     bytes[2] === 0xbf
+   ) {
+     return "utf-8";
+   }
+   if (bytes.length >= 2) {
+     if (bytes[0] === 0xfe && bytes[1] === 0xff) {
+       return "utf-16be";
+     }
+     if (bytes[0] === 0xff && bytes[1] === 0xfe) {
+       return "utf-16le";
+     }
+   }
+
+   // 2. Scan the first 8KB for a meta-declared charset. This is checked before
+   // the HTTP header as a heuristic for misconfigured servers where the HTML
+   // is more likely to be correct.
+   try {
+     const headLen = Math.min(bytes.length, 8192);
+     // Decode with a single-byte encoding so the ASCII markup is readable
+     // no matter what the document's real (ASCII-compatible) encoding is.
+     const head = new TextDecoder("windows-1252").decode(
+       bytes.subarray(0, headLen)
+     );
+
+     const metaCharsetRegex = /<meta\s+charset\s*=\s*["']?([a-z0-9_-]+)/i;
+     let match = head.match(metaCharsetRegex);
+
+     if (!match) {
+       const httpEquivRegex =
+         /<meta\s+http-equiv\s*=\s*["']?content-type["']?[^>]*content\s*=\s*["'][^"']*charset\s*=\s*([a-z0-9_-]+)/i;
+       match = head.match(httpEquivRegex);
+     }
+
+     if (match && match[1]) {
+       const norm = this.normalizeAndValidateEncodingLabel(match[1]);
+       // The tag could only be matched because the bytes are ASCII-compatible
+       // (and no UTF-16 BOM was present above), so a meta-declared UTF-16
+       // label cannot be accurate. The HTML standard's prescan treats such a
+       // declaration as UTF-8; decoding as UTF-16 would produce garbage.
+       if (norm === "utf-16le" || norm === "utf-16be") {
+         return "utf-8";
+       }
+       if (norm) {
+         return norm;
+       }
+     }
+   } catch (e) {
+     // Ignore errors during meta scan and fall through.
+   }
+
+   // 3. Use charset from HTTP header if it's valid. A null or empty value
+   // (no charset known from the header) is skipped.
+   if (headerCharset) {
+     const norm = this.normalizeAndValidateEncodingLabel(headerCharset);
+     if (norm) {
+       return norm;
+     }
+   }
+
+   // 4. Default to UTF-8 if no other charset is found.
+   return "utf-8";
+ }
+
+ /**
+ * Normalizes a charset label and validates it is supported by TextDecoder.
+ *
+ * @param {string} label - The raw encoding label from headers or meta tags.
+ * @returns {string|null} The normalized, validated label, or null if invalid.
+ */
+ normalizeAndValidateEncodingLabel(label) {
+   // Treat null/undefined and whitespace-only labels as "no label".
+   const candidate = (label || "").trim();
+   if (candidate) {
+     try {
+       // Constructing a TextDecoder resolves aliases to the canonical
+       // encoding name and throws for labels it does not recognize.
+       const { encoding } = new TextDecoder(candidate);
+       return encoding;
+     } catch (e) {
+       // Invalid or unsupported label: fall through and report null.
+     }
+   }
+   return null;
+ }
+
+ /**
* Fetches HTML content from a URL and parses its meta tags and page text.
*
* @param {string} url - The URL to fetch and parse.
@@ -257,11 +355,12 @@ export class LinkPreviewChild extends JSWindowActorChild {
];
metaTags.forEach(tag => {
- const name = tag.getAttribute("name") || tag.getAttribute("property");
+ const rawName = tag.getAttribute("name") || tag.getAttribute("property");
const content = tag.getAttribute("content");
- if (name && content) {
- if (desiredMetaNames.includes(name.toLowerCase())) {
- metaInfo[name] = content;
+ const key = rawName ? rawName.toLowerCase() : null;
+ if (key && content) {
+ if (desiredMetaNames.includes(key)) {
+ metaInfo[key] = content;
}
}
});
diff --git a/browser/components/genai/tests/browser/browser.toml b/browser/components/genai/tests/browser/browser.toml
@@ -35,7 +35,8 @@ skip-if = [
["browser_link_preview.js"]
support-files = [
"data/readableFr.html",
- "data/readableEn.html"
+ "data/readableEn.html",
+ "data/encodingWithShiftJIS.html"
]
["browser_link_preview_nimbus.js"]
diff --git a/browser/components/genai/tests/browser/browser_link_preview.js b/browser/components/genai/tests/browser/browser_link_preview.js
@@ -419,6 +419,27 @@ add_task(async function test_fetch_page_data() {
});
/**
+ * Test that Shift-JIS encoding is handled correctly.
+ */
+add_task(async function test_fetch_shift_jis() {
+  await SpecialPowers.pushPrefEnv({
+    set: [["browser.ml.linkPreview.enabled", true]],
+  });
+  const actor =
+    window.browsingContext.currentWindowContext.getActor("LinkPreview");
+  // The fixture declares <meta charset="shift-jis"> and carries a Japanese
+  // title, so a wrong decode shows up as mojibake in the extracted title.
+  const result = await actor.fetchPageData(
+    "https://example.com/browser/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html"
+  );
+
+  ok(!result.error, "should not have an error");
+  // Expected value must match the fixture's <title> exactly ("テスト" = "test").
+  is(
+    result.rawMetaInfo["html:title"],
+    "Shift-JIS テスト",
+    "title should be correct"
+  );
+});
+
+/**
* Test fetching errors.
*/
add_task(async function test_fetch_errors() {
diff --git a/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html b/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Shift-JIS テスト</title>
+<meta charset="shift-jis">
+</head>
+<body>
+<p>「日本語は、ひらがな、カタカナ、漢字という3種類の文字を使う複雑な表記体系を持っています。ひらがなは日本語の固有の言葉や文法的な要素に使われ、カタカナは外来語に使われます。漢字は日本語に取り入れられた中国の文字です。」</p>
+</body>
+</html>