tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

LinkPreviewChild.sys.mjs (13785B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 const lazy = {};
      6 ChromeUtils.defineESModuleGetters(lazy, {
      7  NetUtil: "resource://gre/modules/NetUtil.sys.mjs",
      8  ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
      9  Readerable: "resource://gre/modules/Readerable.sys.mjs",
     10  isProbablyReaderable: "resource://gre/modules/Readerable.sys.mjs",
     11 });
     12 
     13 /**
     14 * Represents a child actor for handling link previews in the browser.
     15 * Interacts with content windows and handles events related to link previews.
     16 *
     17 * @class LinkPreviewChild
     18 * @augments {JSWindowActorChild}
     19 */
     20 export class LinkPreviewChild extends JSWindowActorChild {
     21  /**
     22   * Handles incoming messages from the parent actor.
     23   *
     24   * @param {object} message - The message object containing name and data.
     25   * @param {string} message.name - The name of the message.
     26   * @param {object} message.data - Data associated with the message.
     27   * @returns {Promise<object>|undefined} The result of fetchPageData if applicable.
     28   */
     29  async receiveMessage({ name, data }) {
     30    if (name === "LinkPreview:FetchPageData") {
     31      return this.fetchPageData(data.url);
     32    }
     33    //expected a return value.  consistent-return (eslint)
     34    return undefined;
     35  }
     36 
  /**
   * Fetches the HTML content from the given URL.
   *
   * Only https URLs are accepted. The request goes through a channel (so it
   * gets safe-browsing checks) with a null principal and LOAD_ANONYMOUS, the
   * body is streamed with a 5 MB cap and a text/html content-type check, and
   * the collected bytes are decoded using the charset chosen by
   * sniffCharset() (header charset is passed in as a hint).
   *
   * @param {string} url - The URL to fetch.
   * @returns {Promise<string>} The HTML content as a string.
   * @throws {Error} If the fetch fails or the content type is invalid.
   */
  fetchHTML(url) {
    const uri = lazy.NetUtil.newURI(url);
    if (!uri.schemeIs("https")) {
      throw Components.Exception(
        "Only handling https",
        Cr.NS_ERROR_UNKNOWN_PROTOCOL
      );
    }

    // Make requests with a channel to automatically get safe browsing checks.
    // Use null principals in combination with anonymous for now ahead of
    // fetching content with cookies to handle sites requiring login.
    const principal = Services.scriptSecurityManager.createNullPrincipal({});
    const channel = lazy.NetUtil.newChannel({
      contentPolicyType: Ci.nsIContentPolicy.TYPE_DOCUMENT,
      loadingPrincipal: principal,
      securityFlags: Ci.nsILoadInfo.SEC_ALLOW_CROSS_ORIGIN_INHERITS_SEC_CONTEXT,
      triggeringPrincipal: principal,
      uri,
    }).QueryInterface(Ci.nsIHttpChannel);
    channel.loadFlags = Ci.nsIRequest.LOAD_ANONYMOUS;

    // Specially identify this request, e.g., for publishers to opt out
    channel.setRequestHeader("x-firefox-ai", "1", false);

    const { promise, resolve, reject } = Promise.withResolvers();
    const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit

    // Charset from the Content-Type header (if any); captured in
    // onStartRequest and later passed to sniffCharset() as a hint.
    let charset = null;
    // Raw body chunks accumulated by the stream listener below.
    const byteChunks = [];
    let totalLength = 0;
    channel.asyncOpen({
      onDataAvailable: (request, stream, offset, count) => {
        // Track the running total and cancel (instead of buffering) once it
        // passes the cap; the rejection happens in onStopRequest.
        totalLength += count;
        if (totalLength > MAX_CONTENT_LENGTH) {
          request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
        } else {
          byteChunks.push(lazy.NetUtil.readInputStream(stream, count));
        }
      },
      onStartRequest: request => {
        const http = request.QueryInterface(Ci.nsIHttpChannel);

        // Enforce text/html if provided by server
        let contentType = "";
        try {
          // getResponseHeader throws if the header is absent; treat that as
          // an empty content type and skip the check.
          contentType = http.getResponseHeader("content-type");
        } catch (ex) {}
        if (contentType && !contentType.startsWith("text/html")) {
          request.cancel(Cr.NS_ERROR_FILE_UNKNOWN_TYPE);
        }

        // Save charset without quotes or spaces for TextDecoder
        const match = contentType.match(/charset=["' ]*([^;"' ]+)/i);
        if (match) {
          charset = match[1];
        }

        // Enforce max length if provided by server
        // (header value is a string; JS coerces it for the comparison)
        try {
          if (http.getResponseHeader("content-length") > MAX_CONTENT_LENGTH) {
            request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
          }
        } catch (ex) {}
      },
      onStopRequest: (_request, status) => {
        if (Components.isSuccessCode(status)) {
          // Reassemble the streamed chunks into one contiguous buffer.
          // On success totalLength equals the sum of pushed chunk sizes,
          // since an over-limit request would have been cancelled.
          const bytes = new Uint8Array(totalLength);
          let offset = 0;
          for (const chunk of byteChunks) {
            bytes.set(new Uint8Array(chunk), offset);
            offset += chunk.byteLength;
          }

          const effectiveCharset = this.sniffCharset(bytes, charset);
          let decoded;
          try {
            // Use a non-fatal decode to be more robust to minor encoding errors.
            decoded = new TextDecoder(effectiveCharset).decode(bytes);
          } catch (e) {
            // Fallback to UTF-8 on decode errors or if the label was unsupported.
            decoded = new TextDecoder("utf-8").decode(bytes);
          }
          resolve(decoded);
        } else {
          // Also reached when the listener cancelled (wrong type / too big).
          reject(Components.Exception("Failed to fetch HTML", status));
        }
      },
    });
    return promise;
  }
    135 
    136  /**
    137   * Sniff an effective charset for the given response bytes using the HTML standard's precedence:
    138   *   1) Byte Order Mark (BOM)
    139   *   2) <meta charset> or http-equiv in the first 8KB of the document
    140   *   3) HTTP Content-Type header charset (if provided and valid)
    141   *   4) Default to utf-8
    142   *
    143   * @param {Uint8Array} bytes - The raw response bytes.
    144   * @param {string} headerCharset - The charset from the Content-Type header.
    145   * @returns {string} A validated, effective charset label for TextDecoder.
    146   */
    147  sniffCharset(bytes, headerCharset = "") {
    148    // 1. BOM detection (highest priority)
    149    if (
    150      bytes.length >= 3 &&
    151      bytes[0] === 0xef &&
    152      bytes[1] === 0xbb &&
    153      bytes[2] === 0xbf
    154    ) {
    155      return "utf-8";
    156    }
    157    if (bytes.length >= 2) {
    158      if (bytes[0] === 0xfe && bytes[1] === 0xff) {
    159        return "utf-16be";
    160      }
    161      if (bytes[0] === 0xff && bytes[1] === 0xfe) {
    162        return "utf-16le";
    163      }
    164    }
    165 
    166    // 2. Scan the first 8KB for a meta-declared charset. This is checked before
    167    // the HTTP header as a heuristic for misconfigured servers where the HTML
    168    // is more likely to be correct.
    169    try {
    170      const headLen = Math.min(bytes.length, 8192);
    171      const head = new TextDecoder("windows-1252").decode(
    172        bytes.subarray(0, headLen)
    173      );
    174 
    175      const metaCharsetRegex = /<meta\s+charset\s*=\s*["']?([a-z0-9_-]+)/i;
    176      let match = head.match(metaCharsetRegex);
    177 
    178      if (!match) {
    179        const httpEquivRegex =
    180          /<meta\s+http-equiv\s*=\s*["']?content-type["']?[^>]*content\s*=\s*["'][^"']*charset\s*=\s*([a-z0-9_-]+)/i;
    181        match = head.match(httpEquivRegex);
    182      }
    183 
    184      if (match && match[1]) {
    185        const norm = this.normalizeAndValidateEncodingLabel(match[1]);
    186        if (norm) {
    187          return norm;
    188        }
    189      }
    190    } catch (e) {
    191      // Ignore errors during meta scan and fall through.
    192    }
    193 
    194    // 3. Use charset from HTTP header if it's valid.
    195    if (headerCharset) {
    196      const norm = this.normalizeAndValidateEncodingLabel(headerCharset);
    197      if (norm) {
    198        return norm;
    199      }
    200    }
    201 
    202    // 4. Default to UTF-8 if no other charset is found.
    203    return "utf-8";
    204  }
    205 
    206  /**
    207   * Normalizes a charset label and validates it is supported by TextDecoder.
    208   *
    209   * @param {string} label - The raw encoding label from headers or meta tags.
    210   * @returns {string|null} The normalized, validated label, or null if invalid.
    211   */
    212  normalizeAndValidateEncodingLabel(label) {
    213    const l = (label || "").trim();
    214    if (!l) {
    215      return null;
    216    }
    217    try {
    218      // TextDecoder constructor handles aliases and validation.
    219      return new TextDecoder(l).encoding;
    220    } catch (e) {
    221      // The label was invalid or unsupported.
    222    }
    223    return null;
    224  }
    225 
    226  /**
    227   * Fetches HTML content from a URL and parses its meta tags and page text.
    228   *
    229   * @param {string} url - The URL to fetch and parse.
    230   * @returns {Promise<object>} An object containing meta information, page text, and HTML code.
    231   */
    232  async fetchPageData(url) {
    233    const ret = {
    234      article: {},
    235      rawMetaInfo: {},
    236      url,
    237    };
    238    try {
    239      const htmlCode = await this.fetchHTML(url);
    240      ret.urlComponents = this.extractUrlComponents(url);
    241 
    242      const parser = new DOMParser();
    243      const doc = parser.parseFromString(htmlCode, "text/html");
    244      ret.rawMetaInfo = this.parseMetaTagsFromDoc(doc);
    245 
    246      if (
    247        !lazy.Readerable.shouldCheckUri(lazy.NetUtil.newURI(url)) ||
    248        !lazy.isProbablyReaderable(doc)
    249      ) {
    250        // Add normalized metadata even if the document isn't reader-able
    251        ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo);
    252        return ret;
    253      }
    254 
    255      ret.article = await this.getArticleDataFromDoc(doc);
    256 
    257      ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo, ret.article);
    258    } catch (error) {
    259      console.error(`Failed to fetch and parse page data: ${error}`);
    260      ret.error = { message: error.message, result: error.result };
    261      // Add empty normalized metadata in case of error
    262      ret.meta = this.extractNormalizedMetadata();
    263    }
    264    return ret;
    265  }
    266 
    267  /**
    268   * Extracts and normalizes metadata from the page's meta tags and article content.
    269   *
    270   * @param {object} metaData - Metadata extracted from the page's meta tags (Open Graph, Twitter, HTML)
    271   * @param {object} articleData - Data extracted from the article content using ReaderMode
    272   * @returns {object} Normalized metadata containing:
    273   *   - title: Page title prioritizing Open Graph, Twitter, then HTML title
    274   *   - description: Content excerpt or meta description from various sources
    275   *   - imageUrl: HTTPS-only URL of the page's primary image
    276   *   - isMissingMetadata: Boolean flag indicating if description is missing
    277   */
    278  extractNormalizedMetadata(metaData = {}, articleData = {}) {
    279    const title =
    280      metaData["og:title"] ||
    281      metaData["twitter:title"] ||
    282      metaData["html:title"] ||
    283      "";
    284 
    285    const description =
    286      articleData.excerpt ||
    287      metaData["og:description"] ||
    288      metaData["twitter:description"] ||
    289      metaData.description ||
    290      "";
    291 
    292    let imageUrl = metaData["og:image"] || metaData["twitter:image:src"] || "";
    293 
    294    if (!imageUrl.startsWith("https://")) {
    295      imageUrl = "";
    296    }
    297 
    298    return {
    299      title,
    300      description,
    301      imageUrl,
    302    };
    303  }
    304 
    305  /**
    306   * Extracts URL components including domain and filename.
    307   *
    308   * @param {string} url - The URL to extract information from.
    309   * @returns {object} Object containing domain and filename.
    310   */
    311  extractUrlComponents(url) {
    312    try {
    313      const urlObj = new URL(url);
    314      const domain = urlObj.hostname;
    315 
    316      // Extract the filename (last part of pathname)
    317      let pathname = urlObj.pathname;
    318      // Remove trailing slash if present
    319      if (pathname.endsWith("/")) {
    320        pathname = pathname.slice(0, -1);
    321      }
    322 
    323      // Get last segment of path
    324      const pathParts = pathname.split("/");
    325      const filename = pathParts[pathParts.length - 1] || domain;
    326 
    327      return { domain, filename };
    328    } catch (e) {
    329      // Return both properties with same fallback value if URL is invalid
    330      return { domain: url, filename: url };
    331    }
    332  }
    333 
    334  /**
    335   * Parses meta tags from the provided Document into a key-value object.
    336   * Also extracts the title if available.
    337   *
    338   * @param {Document} doc - The parsed HTML document.
    339   * @returns {object} An object containing meta tag key-value pairs.
    340   */
    341  parseMetaTagsFromDoc(doc) {
    342    const metaTags = doc.querySelectorAll("meta");
    343    const metaInfo = {};
    344 
    345    // TODO: Define the meta tags we are interested in
    346    const desiredMetaNames = [
    347      "description",
    348      "og:image",
    349      "title",
    350      "og:title",
    351      "twitter:title",
    352      "og:description",
    353      "twitter:description",
    354      "twitter:image:src",
    355    ];
    356 
    357    metaTags.forEach(tag => {
    358      const rawName = tag.getAttribute("name") || tag.getAttribute("property");
    359      const content = tag.getAttribute("content");
    360      const key = rawName ? rawName.toLowerCase() : null;
    361      if (key && content) {
    362        if (desiredMetaNames.includes(key)) {
    363          metaInfo[key] = content;
    364        }
    365      }
    366    });
    367 
    368    const title = doc.querySelector("title")?.textContent;
    369    if (title) {
    370      metaInfo["html:title"] = title;
    371    }
    372 
    373    return metaInfo;
    374  }
    375 
    376  /**
    377   * Extracts article data from the provided Document using ReaderMode.
    378   *
    379   * @param {Document} doc - The parsed HTML document.
    380   * @returns {Promise<object>} The extracted article data including specified fields.
    381   */
    382  async getArticleDataFromDoc(doc) {
    383    try {
    384      const article = await lazy.ReaderMode.parseDocument(doc);
    385      if (article) {
    386        const {
    387          title,
    388          byline,
    389          content,
    390          detectedLanguage,
    391          length,
    392          siteName,
    393          excerpt,
    394          readingTimeMinsSlow,
    395          readingTimeMinsFast,
    396        } = article;
    397 
    398        // parseDocument return a `textContent` that strips structure and newlines, which we need for the model.
    399        // So we convert the HTML `content` to plain text directly, preserving formatting and newlines.
    400        const textContent = Cc["@mozilla.org/parserutils;1"]
    401          .getService(Ci.nsIParserUtils)
    402          .convertToPlainText(
    403            content,
    404            null,
    405            0 // No line-wrapping
    406          );
    407 
    408        return {
    409          title,
    410          byline,
    411          textContent,
    412          detectedLanguage,
    413          length,
    414          siteName,
    415          excerpt,
    416          readingTimeMinsFast,
    417          readingTimeMinsSlow,
    418        };
    419      }
    420    } catch (error) {
    421      console.error("Error parsing document with ReaderMode:", error);
    422    }
    423 
    424    return {};
    425  }
    426 }