tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 6bd0f2609a93f191facac703e722b7a43f8aa255
parent 3ae6567b846f7290fe66f78af21209899f070ca4
Author: Tim Xia <txia@mozilla.com>
Date:   Fri, 10 Oct 2025 18:19:21 +0000

Bug 1984938 - Link previews do not obey text encoding for non-UTF-8 pages, r=Mardak,firefox-ai-ml-reviewers

- source from HTTP Content-Type header
- add test file encoded in shift-jis encoding

Differential Revision: https://phabricator.services.mozilla.com/D264152

Diffstat:
Mbrowser/components/genai/LinkPreviewChild.sys.mjs | 119++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Mbrowser/components/genai/tests/browser/browser.toml | 3++-
Mbrowser/components/genai/tests/browser/browser_link_preview.js | 21+++++++++++++++++++++
Abrowser/components/genai/tests/browser/data/encodingWithShiftJIS.html | 10++++++++++
4 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/browser/components/genai/LinkPreviewChild.sys.mjs b/browser/components/genai/LinkPreviewChild.sys.mjs @@ -69,11 +69,11 @@ export class LinkPreviewChild extends JSWindowActorChild { const { promise, resolve, reject } = Promise.withResolvers(); const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit - let charset = "utf-8"; + let charset = null; const byteChunks = []; let totalLength = 0; channel.asyncOpen({ - onDataAvailable(request, stream, offset, count) { + onDataAvailable: (request, stream, offset, count) => { totalLength += count; if (totalLength > MAX_CONTENT_LENGTH) { request.cancel(Cr.NS_ERROR_FILE_TOO_BIG); @@ -81,7 +81,7 @@ export class LinkPreviewChild extends JSWindowActorChild { byteChunks.push(lazy.NetUtil.readInputStream(stream, count)); } }, - onStartRequest(request) { + onStartRequest: request => { const http = request.QueryInterface(Ci.nsIHttpChannel); // Enforce text/html if provided by server @@ -106,7 +106,7 @@ export class LinkPreviewChild extends JSWindowActorChild { } } catch (ex) {} }, - onStopRequest(_request, status) { + onStopRequest: (_request, status) => { if (Components.isSuccessCode(status)) { const bytes = new Uint8Array(totalLength); let offset = 0; @@ -115,8 +115,16 @@ export class LinkPreviewChild extends JSWindowActorChild { offset += chunk.byteLength; } - const decoder = new TextDecoder(charset); - resolve(decoder.decode(bytes)); + const effectiveCharset = this.sniffCharset(bytes, charset); + let decoded; + try { + // Use a non-fatal decode to be more robust to minor encoding errors. + decoded = new TextDecoder(effectiveCharset).decode(bytes); + } catch (e) { + // Fallback to UTF-8 on decode errors or if the label was unsupported. 
+ decoded = new TextDecoder("utf-8").decode(bytes); + } + resolve(decoded); } else { reject(Components.Exception("Failed to fetch HTML", status)); } @@ -126,6 +134,96 @@ export class LinkPreviewChild extends JSWindowActorChild { } /** + * Sniff an effective charset for the given response bytes using the HTML standard's precedence: + * 1) Byte Order Mark (BOM) + * 2) <meta charset> or http-equiv in the first 8KB of the document + * 3) HTTP Content-Type header charset (if provided and valid) + * 4) Default to utf-8 + * + * @param {Uint8Array} bytes - The raw response bytes. + * @param {string} headerCharset - The charset from the Content-Type header. + * @returns {string} A validated, effective charset label for TextDecoder. + */ + sniffCharset(bytes, headerCharset = "") { + // 1. BOM detection (highest priority) + if ( + bytes.length >= 3 && + bytes[0] === 0xef && + bytes[1] === 0xbb && + bytes[2] === 0xbf + ) { + return "utf-8"; + } + if (bytes.length >= 2) { + if (bytes[0] === 0xfe && bytes[1] === 0xff) { + return "utf-16be"; + } + if (bytes[0] === 0xff && bytes[1] === 0xfe) { + return "utf-16le"; + } + } + + // 2. Scan the first 8KB for a meta-declared charset. This is checked before + // the HTTP header as a heuristic for misconfigured servers where the HTML + // is more likely to be correct. + try { + const headLen = Math.min(bytes.length, 8192); + const head = new TextDecoder("windows-1252").decode( + bytes.subarray(0, headLen) + ); + + const metaCharsetRegex = /<meta\s+charset\s*=\s*["']?([a-z0-9_-]+)/i; + let match = head.match(metaCharsetRegex); + + if (!match) { + const httpEquivRegex = + /<meta\s+http-equiv\s*=\s*["']?content-type["']?[^>]*content\s*=\s*["'][^"']*charset\s*=\s*([a-z0-9_-]+)/i; + match = head.match(httpEquivRegex); + } + + if (match && match[1]) { + const norm = this.normalizeAndValidateEncodingLabel(match[1]); + if (norm) { + return norm; + } + } + } catch (e) { + // Ignore errors during meta scan and fall through. + } + + // 3. 
Use charset from HTTP header if it's valid. + if (headerCharset) { + const norm = this.normalizeAndValidateEncodingLabel(headerCharset); + if (norm) { + return norm; + } + } + + // 4. Default to UTF-8 if no other charset is found. + return "utf-8"; + } + + /** + * Normalizes a charset label and validates it is supported by TextDecoder. + * + * @param {string} label - The raw encoding label from headers or meta tags. + * @returns {string|null} The normalized, validated label, or null if invalid. + */ + normalizeAndValidateEncodingLabel(label) { + const l = (label || "").trim(); + if (!l) { + return null; + } + try { + // TextDecoder constructor handles aliases and validation. + return new TextDecoder(l).encoding; + } catch (e) { + // The label was invalid or unsupported. + } + return null; + } + + /** * Fetches HTML content from a URL and parses its meta tags and page text. * * @param {string} url - The URL to fetch and parse. @@ -257,11 +355,12 @@ export class LinkPreviewChild extends JSWindowActorChild { ]; metaTags.forEach(tag => { - const name = tag.getAttribute("name") || tag.getAttribute("property"); + const rawName = tag.getAttribute("name") || tag.getAttribute("property"); const content = tag.getAttribute("content"); - if (name && content) { - if (desiredMetaNames.includes(name.toLowerCase())) { - metaInfo[name] = content; + const key = rawName ? 
rawName.toLowerCase() : null; + if (key && content) { + if (desiredMetaNames.includes(key)) { + metaInfo[key] = content; } } }); diff --git a/browser/components/genai/tests/browser/browser.toml b/browser/components/genai/tests/browser/browser.toml @@ -35,7 +35,8 @@ skip-if = [ ["browser_link_preview.js"] support-files = [ "data/readableFr.html", - "data/readableEn.html" + "data/readableEn.html", + "data/encodingWithShiftJIS.html" ] ["browser_link_preview_nimbus.js"] diff --git a/browser/components/genai/tests/browser/browser_link_preview.js b/browser/components/genai/tests/browser/browser_link_preview.js @@ -419,6 +419,27 @@ add_task(async function test_fetch_page_data() { }); /** + * Test that Shift-JIS encoding is handled correctly. + */ +add_task(async function test_fetch_shift_jis() { + await SpecialPowers.pushPrefEnv({ + set: [["browser.ml.linkPreview.enabled", true]], + }); + const actor = + window.browsingContext.currentWindowContext.getActor("LinkPreview"); + const result = await actor.fetchPageData( + "https://example.com/browser/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html" + ); + + ok(!result.error, "should not have an error"); + is( + result.rawMetaInfo["html:title"], + "Shift-JIS テスト", + "title should be correct" + ); +}); + +/** * Test fetching errors. 
*/ add_task(async function test_fetch_errors() { diff --git a/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html b/browser/components/genai/tests/browser/data/encodingWithShiftJIS.html @@ -0,0 +1,10 @@ +<!DOCTYPE html> +<html> +<head> +<title>Shift-JIS &#12486;&#12473;&#12488;</title> +<meta charset="shift-jis"> +</head> +<body> +<p>&#12300;&#26085;&#26412;&#35486;&#12399;&#12289;&#12402;&#12425;&#12364;&#12394;&#12289;&#12459;&#12479;&#12459;&#12490;&#12289;&#28450;&#23383;&#12392;&#12356;&#12358;3&#31278;&#39006;&#12398;&#25991;&#23383;&#12434;&#20351;&#12358;&#35423;&#38627;&#12394;&#34920;&#35352;&#20307;&#31995;&#12434;&#25345;&#12387;&#12390;&#12356;&#12414;&#12377;&#12290;&#12402;&#12425;&#12364;&#12394;&#12399;&#26085;&#26412;&#35486;&#12398;&#22266;&#26377;&#12398;&#35328;&#33865;&#12420;&#25991;&#27861;&#30340;&#12394;&#35201;&#32032;&#12395;&#20351;&#12431;&#12428;&#12289;&#12459;&#12479;&#12459;&#12490;&#12399;&#22806;&#26469;&#35486;&#12395;&#20351;&#12431;&#12428;&#12414;&#12377;&#12290;&#28450;&#23383;&#12399;&#26085;&#26412;&#35486;&#12395;&#21462;&#12426;&#20837;&#12428;&#12425;&#12428;&#12383;&#20013;&#22269;&#12398;&#25991;&#23383;&#12391;&#12377;&#12290;&#12301;</p> +</body> +</html>