LinkPreviewChild.sys.mjs (13785B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  NetUtil: "resource://gre/modules/NetUtil.sys.mjs",
  ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
  Readerable: "resource://gre/modules/Readerable.sys.mjs",
  isProbablyReaderable: "resource://gre/modules/Readerable.sys.mjs",
});

/**
 * Represents a child actor for handling link previews in the browser.
 * Interacts with content windows and handles events related to link previews.
 *
 * @class LinkPreviewChild
 * @augments {JSWindowActorChild}
 */
export class LinkPreviewChild extends JSWindowActorChild {
  /**
   * Handles incoming messages from the parent actor.
   *
   * @param {object} message - The message object containing name and data.
   * @param {string} message.name - The name of the message.
   * @param {object} message.data - Data associated with the message.
   * @returns {Promise<object>|undefined} The result of fetchPageData if applicable.
   */
  async receiveMessage({ name, data }) {
    if (name === "LinkPreview:FetchPageData") {
      return this.fetchPageData(data.url);
    }
    // Explicit return for unknown messages (satisfies consistent-return).
    return undefined;
  }

  /**
   * Fetches the HTML content from the given URL.
   *
   * @param {string} url - The URL to fetch.
   * @returns {Promise<string>} The HTML content as a string.
   * @throws {Error} If the fetch fails or the content type is invalid.
   */
  fetchHTML(url) {
    const uri = lazy.NetUtil.newURI(url);
    if (!uri.schemeIs("https")) {
      throw Components.Exception(
        "Only handling https",
        Cr.NS_ERROR_UNKNOWN_PROTOCOL
      );
    }

    // Make requests with a channel to automatically get safe browsing checks.
    // Use null principals in combination with anonymous for now ahead of
    // fetching content with cookies to handle sites requiring login.
    const principal = Services.scriptSecurityManager.createNullPrincipal({});
    const channel = lazy.NetUtil.newChannel({
      contentPolicyType: Ci.nsIContentPolicy.TYPE_DOCUMENT,
      loadingPrincipal: principal,
      securityFlags: Ci.nsILoadInfo.SEC_ALLOW_CROSS_ORIGIN_INHERITS_SEC_CONTEXT,
      triggeringPrincipal: principal,
      uri,
    }).QueryInterface(Ci.nsIHttpChannel);
    // OR in the anonymous flag rather than overwriting any flags the channel
    // was created with; plain assignment would silently drop them.
    channel.loadFlags |= Ci.nsIRequest.LOAD_ANONYMOUS;

    // Specially identify this request, e.g., for publishers to opt out
    channel.setRequestHeader("x-firefox-ai", "1", false);

    const { promise, resolve, reject } = Promise.withResolvers();
    const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit

    let charset = null;
    const byteChunks = [];
    let totalLength = 0;
    channel.asyncOpen({
      onDataAvailable: (request, stream, offset, count) => {
        totalLength += count;
        if (totalLength > MAX_CONTENT_LENGTH) {
          request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
        } else {
          byteChunks.push(lazy.NetUtil.readInputStream(stream, count));
        }
      },
      onStartRequest: request => {
        const http = request.QueryInterface(Ci.nsIHttpChannel);

        // Enforce text/html if provided by server
        let contentType = "";
        try {
          contentType = http.getResponseHeader("content-type");
        } catch (ex) {
          // Header absent; treat as unknown and continue best-effort.
        }
        if (contentType && !contentType.startsWith("text/html")) {
          request.cancel(Cr.NS_ERROR_FILE_UNKNOWN_TYPE);
        }

        // Save charset without quotes or spaces for TextDecoder
        const match = contentType.match(/charset=["' ]*([^;"' ]+)/i);
        if (match) {
          charset = match[1];
        }

        // Enforce max length if provided by server. Parse explicitly instead
        // of relying on string-to-number coercion in the comparison; a
        // missing or unparseable header falls through to the streaming limit
        // enforced in onDataAvailable.
        try {
          const declaredLength = parseInt(
            http.getResponseHeader("content-length"),
            10
          );
          if (declaredLength > MAX_CONTENT_LENGTH) {
            request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
          }
        } catch (ex) {
          // Header absent; the running total check above still applies.
        }
      },
      onStopRequest: (_request, status) => {
        if (Components.isSuccessCode(status)) {
          const bytes = new Uint8Array(totalLength);
          let offset = 0;
          for (const chunk of byteChunks) {
            bytes.set(new Uint8Array(chunk), offset);
            offset += chunk.byteLength;
          }

          const effectiveCharset = this.sniffCharset(bytes, charset);
          let decoded;
          try {
            // Use a non-fatal decode to be more robust to minor encoding errors.
            decoded = new TextDecoder(effectiveCharset).decode(bytes);
          } catch (e) {
            // Fallback to UTF-8 on decode errors or if the label was unsupported.
            decoded = new TextDecoder("utf-8").decode(bytes);
          }
          resolve(decoded);
        } else {
          reject(Components.Exception("Failed to fetch HTML", status));
        }
      },
    });
    return promise;
  }

  /**
   * Sniff an effective charset for the given response bytes using the HTML standard's precedence:
   * 1) Byte Order Mark (BOM)
   * 2) <meta charset> or http-equiv in the first 8KB of the document
   * 3) HTTP Content-Type header charset (if provided and valid)
   * 4) Default to utf-8
   *
   * @param {Uint8Array} bytes - The raw response bytes.
   * @param {string} headerCharset - The charset from the Content-Type header.
   * @returns {string} A validated, effective charset label for TextDecoder.
   */
  sniffCharset(bytes, headerCharset = "") {
    // 1. BOM detection (highest priority)
    if (
      bytes.length >= 3 &&
      bytes[0] === 0xef &&
      bytes[1] === 0xbb &&
      bytes[2] === 0xbf
    ) {
      return "utf-8";
    }
    if (bytes.length >= 2) {
      if (bytes[0] === 0xfe && bytes[1] === 0xff) {
        return "utf-16be";
      }
      if (bytes[0] === 0xff && bytes[1] === 0xfe) {
        return "utf-16le";
      }
    }

    // 2. Scan the first 8KB for a meta-declared charset. This is checked before
    // the HTTP header as a heuristic for misconfigured servers where the HTML
    // is more likely to be correct.
    try {
      const headLen = Math.min(bytes.length, 8192);
      // windows-1252 maps every byte to a character, so the ASCII markup we
      // scan for is always decodable regardless of the true encoding.
      const head = new TextDecoder("windows-1252").decode(
        bytes.subarray(0, headLen)
      );

      const metaCharsetRegex = /<meta\s+charset\s*=\s*["']?([a-z0-9_-]+)/i;
      let match = head.match(metaCharsetRegex);

      if (!match) {
        const httpEquivRegex =
          /<meta\s+http-equiv\s*=\s*["']?content-type["']?[^>]*content\s*=\s*["'][^"']*charset\s*=\s*([a-z0-9_-]+)/i;
        match = head.match(httpEquivRegex);
      }

      if (match && match[1]) {
        const norm = this.normalizeAndValidateEncodingLabel(match[1]);
        if (norm) {
          return norm;
        }
      }
    } catch (e) {
      // Ignore errors during meta scan and fall through.
    }

    // 3. Use charset from HTTP header if it's valid.
    if (headerCharset) {
      const norm = this.normalizeAndValidateEncodingLabel(headerCharset);
      if (norm) {
        return norm;
      }
    }

    // 4. Default to UTF-8 if no other charset is found.
    return "utf-8";
  }

  /**
   * Normalizes a charset label and validates it is supported by TextDecoder.
   *
   * @param {string} label - The raw encoding label from headers or meta tags.
   * @returns {string|null} The normalized, validated label, or null if invalid.
   */
  normalizeAndValidateEncodingLabel(label) {
    const l = (label || "").trim();
    if (!l) {
      return null;
    }
    try {
      // TextDecoder constructor handles aliases and validation.
      return new TextDecoder(l).encoding;
    } catch (e) {
      // The label was invalid or unsupported.
    }
    return null;
  }

  /**
   * Fetches HTML content from a URL and parses its meta tags and page text.
   *
   * @param {string} url - The URL to fetch and parse.
   * @returns {Promise<object>} An object containing meta information, page text, and HTML code.
   */
  async fetchPageData(url) {
    const ret = {
      article: {},
      rawMetaInfo: {},
      url,
    };
    try {
      const htmlCode = await this.fetchHTML(url);
      ret.urlComponents = this.extractUrlComponents(url);

      const parser = new DOMParser();
      const doc = parser.parseFromString(htmlCode, "text/html");
      ret.rawMetaInfo = this.parseMetaTagsFromDoc(doc);

      if (
        !lazy.Readerable.shouldCheckUri(lazy.NetUtil.newURI(url)) ||
        !lazy.isProbablyReaderable(doc)
      ) {
        // Add normalized metadata even if the document isn't reader-able
        ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo);
        return ret;
      }

      ret.article = await this.getArticleDataFromDoc(doc);

      ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo, ret.article);
    } catch (error) {
      console.error(`Failed to fetch and parse page data: ${error}`);
      ret.error = { message: error.message, result: error.result };
      // Add empty normalized metadata in case of error
      ret.meta = this.extractNormalizedMetadata();
    }
    return ret;
  }

  /**
   * Extracts and normalizes metadata from the page's meta tags and article content.
   *
   * @param {object} metaData - Metadata extracted from the page's meta tags (Open Graph, Twitter, HTML)
   * @param {object} articleData - Data extracted from the article content using ReaderMode
   * @returns {object} Normalized metadata containing:
   *   - title: Page title prioritizing Open Graph, Twitter, then HTML title
   *   - description: Content excerpt or meta description from various sources
   *   - imageUrl: HTTPS-only URL of the page's primary image
   */
  extractNormalizedMetadata(metaData = {}, articleData = {}) {
    const title =
      metaData["og:title"] ||
      metaData["twitter:title"] ||
      metaData["html:title"] ||
      "";

    const description =
      articleData.excerpt ||
      metaData["og:description"] ||
      metaData["twitter:description"] ||
      metaData.description ||
      "";

    let imageUrl = metaData["og:image"] || metaData["twitter:image:src"] || "";

    // Only surface https images; anything else is dropped.
    if (!imageUrl.startsWith("https://")) {
      imageUrl = "";
    }

    return {
      title,
      description,
      imageUrl,
    };
  }

  /**
   * Extracts URL components including domain and filename.
   *
   * @param {string} url - The URL to extract information from.
   * @returns {object} Object containing domain and filename.
   */
  extractUrlComponents(url) {
    try {
      const urlObj = new URL(url);
      const domain = urlObj.hostname;

      // Extract the filename (last part of pathname)
      let pathname = urlObj.pathname;
      // Remove trailing slash if present
      if (pathname.endsWith("/")) {
        pathname = pathname.slice(0, -1);
      }

      // Get last segment of path
      const pathParts = pathname.split("/");
      const filename = pathParts[pathParts.length - 1] || domain;

      return { domain, filename };
    } catch (e) {
      // Return both properties with same fallback value if URL is invalid
      return { domain: url, filename: url };
    }
  }

  /**
   * Parses meta tags from the provided Document into a key-value object.
   * Also extracts the title if available.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {object} An object containing meta tag key-value pairs.
   */
  parseMetaTagsFromDoc(doc) {
    const metaTags = doc.querySelectorAll("meta");
    const metaInfo = {};

    // The meta tags we are interested in; everything else is ignored.
    const desiredMetaNames = [
      "description",
      "og:image",
      "title",
      "og:title",
      "twitter:title",
      "og:description",
      "twitter:description",
      "twitter:image:src",
    ];

    metaTags.forEach(tag => {
      const rawName = tag.getAttribute("name") || tag.getAttribute("property");
      const content = tag.getAttribute("content");
      const key = rawName ? rawName.toLowerCase() : null;
      if (key && content) {
        if (desiredMetaNames.includes(key)) {
          metaInfo[key] = content;
        }
      }
    });

    const title = doc.querySelector("title")?.textContent;
    if (title) {
      metaInfo["html:title"] = title;
    }

    return metaInfo;
  }

  /**
   * Extracts article data from the provided Document using ReaderMode.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {Promise<object>} The extracted article data including specified fields.
   */
  async getArticleDataFromDoc(doc) {
    try {
      const article = await lazy.ReaderMode.parseDocument(doc);
      if (article) {
        const {
          title,
          byline,
          content,
          detectedLanguage,
          length,
          siteName,
          excerpt,
          readingTimeMinsSlow,
          readingTimeMinsFast,
        } = article;

        // parseDocument returns a `textContent` that strips structure and newlines, which we need for the model.
        // So we convert the HTML `content` to plain text directly, preserving formatting and newlines.
        const textContent = Cc["@mozilla.org/parserutils;1"]
          .getService(Ci.nsIParserUtils)
          .convertToPlainText(
            content,
            null,
            0 // No line-wrapping
          );

        return {
          title,
          byline,
          textContent,
          detectedLanguage,
          length,
          siteName,
          excerpt,
          readingTimeMinsFast,
          readingTimeMinsSlow,
        };
      }
    } catch (error) {
      console.error("Error parsing document with ReaderMode:", error);
    }

    return {};
  }
}