tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SchemaOrgPageData.sys.mjs (10709B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 import { PageDataSchema } from "moz-src:///browser/components/pagedata/PageDataSchema.sys.mjs";
      6 
      7 /**
      8 * Represents an item from the schema.org specification.
      9 *
     10 * Every `Item` has a type and a set of properties. Each property has a string
     11 * name and a list of values. It often isn't clear from the spec whether a
     12 * property is expected to have a list of values or just one value so this
     13 * data structure stores every property as a list and provides a simple method
     14 * to get the first property value.
     15 */
     16 class Item {
     17  /** @type {string} The type of the item e.g. "Product" or "Person". */
     18  type;
     19 
     20  /** @type {Map<string, any[]>} Properties of the item. */
     21  properties = new Map();
     22 
     23  /**
     24   * Constructors a new `Item` of the given type.
     25   *
     26   * @param {string} type
     27   *   The type of the item.
     28   */
     29  constructor(type) {
     30    this.type = type;
     31  }
     32 
     33  /**
     34   * Tests whether a property has any values in this item.
     35   *
     36   * @param {string} prop
     37   *   The name of the property.
     38   * @returns {boolean}
     39   */
     40  has(prop) {
     41    return this.properties.has(prop);
     42  }
     43 
     44  /**
     45   * Gets all of the values for a property. This may return an empty array if
     46   * there are no values.
     47   *
     48   * @param {string} prop
     49   *   The name of the property.
     50   * @returns {any[]}
     51   */
     52  all(prop) {
     53    return this.properties.get(prop) ?? [];
     54  }
     55 
     56  /**
     57   * Gets the first value for a property.
     58   *
     59   * @param {string} prop
     60   *   The name of the property.
     61   * @returns {any}
     62   */
     63  get(prop) {
     64    return this.properties.get(prop)?.[0];
     65  }
     66 
     67  /**
     68   * Sets a value for a property.
     69   *
     70   * @param {string} prop
     71   *   The name of the property.
     72   * @param {any} value
     73   *   The value of the property.
     74   */
     75  set(prop, value) {
     76    let props = this.properties.get(prop);
     77    if (props === undefined) {
     78      props = [];
     79      this.properties.set(prop, props);
     80    }
     81 
     82    props.push(value);
     83  }
     84 
     85  /**
     86   * Converts this item to JSON-LD.
     87   *
     88   * Single array properties are converted into simple properties.
     89   *
     90   * @returns {object}
     91   */
     92  toJsonLD() {
     93    /**
     94     * Converts a value to its JSON-LD representation.
     95     *
     96     * @param {any} val
     97     *   The value to convert.
     98     * @returns {any}
     99     */
    100    function toLD(val) {
    101      if (val instanceof Item) {
    102        return val.toJsonLD();
    103      }
    104      return val;
    105    }
    106 
    107    let props = Array.from(this.properties, ([key, value]) => {
    108      if (value.length == 1) {
    109        return [key, toLD(value[0])];
    110      }
    111 
    112      return [key, value.map(toLD)];
    113    });
    114 
    115    return {
    116      "@type": this.type,
    117      ...Object.fromEntries(props),
    118    };
    119  }
    120 }
    121 
    122 /**
    123 * Parses the value for a given microdata property.
    124 * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
    125 *
    126 * @param {Element} propElement
    127 *   The property element.
    128 * @returns {any}
    129 *   The value of the property.
    130 */
    131 function parseMicrodataProp(propElement) {
    132  if (propElement.hasAttribute("itemscope")) {
    133    throw new Error(
    134      "Cannot parse a simple property value from an itemscope element."
    135    );
    136  }
    137 
    138  const parseUrl = (urlElement, attr) => {
    139    if (!urlElement.hasAttribute(attr)) {
    140      return "";
    141    }
    142 
    143    let url = URL.parse(
    144      urlElement.getAttribute(attr),
    145      urlElement.ownerDocument.documentURI
    146    );
    147    return url ? url.toString() : "";
    148  };
    149 
    150  switch (propElement.localName) {
    151    case "meta":
    152      return propElement.getAttribute("content") ?? "";
    153    case "audio":
    154    case "embed":
    155    case "iframe":
    156    case "source":
    157    case "track":
    158    case "video":
    159      return parseUrl(propElement, "src");
    160    case "img":
    161      // Some pages may be using a lazy loading approach to images, putting a
    162      // temporary image in "src" while the real image is in a differently
    163      // named attribute. So far we found "content" and "data-src" are common
    164      // names for that attribute.
    165      return (
    166        parseUrl(propElement, "content") ||
    167        parseUrl(propElement, "data-src") ||
    168        parseUrl(propElement, "src")
    169      );
    170    case "object":
    171      return parseUrl(propElement, "data");
    172    case "a":
    173    case "area":
    174    case "link":
    175      return parseUrl(propElement, "href");
    176    case "data":
    177    case "meter":
    178      return propElement.getAttribute("value");
    179    case "time":
    180      if (propElement.hasAtribute("datetime")) {
    181        return propElement.getAttribute("datetime");
    182      }
    183      return propElement.textContent;
    184    default:
    185      // Not mentioned in the spec but sites seem to use it.
    186      if (propElement.hasAttribute("content")) {
    187        return propElement.getAttribute("content");
    188      }
    189      return propElement.textContent;
    190  }
    191 }
    192 
    193 /**
    194 * Collects product data from an item.
    195 *
    196 * @param {Document} document
    197 *   The document the item comes from.
    198 * @param {PageData} pageData
    199 *   The pageData object to add to.
    200 * @param {Item} item
    201 *   The product item.
    202 */
    203 function collectProduct(document, pageData, item) {
    204  if (item.has("image")) {
    205    let url = new URL(item.get("image"), document.documentURI);
    206    pageData.image = url.toString();
    207  }
    208 
    209  if (item.has("description")) {
    210    pageData.description = item.get("description");
    211  }
    212 
    213  pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = {
    214    name: item.get("name"),
    215  };
    216 
    217  for (let offer of item.all("offers")) {
    218    if (!(offer instanceof Item) || offer.type != "Offer") {
    219      continue;
    220    }
    221 
    222    let price = parseFloat(offer.get("price"));
    223    if (!isNaN(price)) {
    224      pageData.data[PageDataSchema.DATA_TYPE.PRODUCT].price = {
    225        value: price,
    226        currency: offer.get("priceCurrency"),
    227      };
    228 
    229      break;
    230    }
    231  }
    232 }
    233 
    234 /**
    235 * Returns the root microdata items from the given document.
    236 *
    237 * @param {Document} document
    238 *   The DOM document to collect from.
    239 * @returns {Item[]}
    240 */
    241 function collectMicrodataItems(document) {
    242  // First find all of the items in the document.
    243  let itemElements = document.querySelectorAll(
    244    "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']"
    245  );
    246 
    247  /**
    248   * Maps elements to the closest item.
    249   *
    250   * @type {Map<Element, Item>}
    251   */
    252  let items = new Map();
    253 
    254  /**
    255   * Finds the item for an element. Throws if there is no item. Caches the
    256   * result.
    257   *
    258   * @param {Element} element
    259   *   The element to search from.
    260   * @returns {Item}
    261   */
    262  function itemFor(element) {
    263    let item = items.get(element);
    264    if (item) {
    265      return item;
    266    }
    267 
    268    if (!element.parentElement) {
    269      throw new Error("Element has no parent item.");
    270    }
    271 
    272    item = itemFor(element.parentElement);
    273    items.set(element, item);
    274    return item;
    275  }
    276 
    277  for (let element of itemElements) {
    278    let itemType = element.getAttribute("itemtype");
    279    // Strip off the base url
    280    if (itemType.startsWith("https://")) {
    281      itemType = itemType.substring(19);
    282    } else {
    283      itemType = itemType.substring(18);
    284    }
    285 
    286    items.set(element, new Item(itemType));
    287  }
    288 
    289  // The initial roots are just all the items.
    290  let roots = new Set(items.values());
    291 
    292  // Now find all item properties.
    293  let itemProps = document.querySelectorAll(
    294    "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]"
    295  );
    296 
    297  for (let element of itemProps) {
    298    // The item is always defined above the current element.
    299    let item = itemFor(element.parentElement);
    300 
    301    // The properties value is either a nested item or a simple value.
    302    let propValue = items.get(element) ?? parseMicrodataProp(element);
    303    item.set(element.getAttribute("itemprop"), propValue);
    304 
    305    if (propValue instanceof Item) {
    306      // This item belongs to another item and so is not a root item.
    307      roots.delete(propValue);
    308    }
    309  }
    310 
    311  return [...roots];
    312 }
    313 
    314 /**
    315 * Returns the root JSON-LD items from the given document.
    316 *
    317 * @param {Document} document
    318 *   The DOM document to collect from.
    319 * @returns {Item[]}
    320 */
    321 function collectJsonLDItems(document) {
    322  /**
    323   * The root items.
    324   *
    325   * @type {Item[]}
    326   */
    327  let items = [];
    328 
    329  /**
    330   * Converts a JSON-LD value into an Item if appropriate.
    331   *
    332   * @param {any} val
    333   *   The value to convert.
    334   * @returns {any}
    335   */
    336  function fromLD(val) {
    337    if (typeof val == "object" && "@type" in val) {
    338      let item = new Item(val["@type"]);
    339 
    340      for (let [prop, value] of Object.entries(val)) {
    341        // Ignore meta properties.
    342        if (prop.startsWith("@")) {
    343          continue;
    344        }
    345 
    346        if (!Array.isArray(value)) {
    347          value = [value];
    348        }
    349 
    350        item.properties.set(prop, value.map(fromLD));
    351      }
    352 
    353      return item;
    354    }
    355 
    356    return val;
    357  }
    358 
    359  let scripts = document.querySelectorAll("script[type='application/ld+json'");
    360  for (let script of scripts) {
    361    try {
    362      let content = JSON.parse(script.textContent);
    363 
    364      if (typeof content != "object") {
    365        continue;
    366      }
    367 
    368      if (!("@context" in content)) {
    369        continue;
    370      }
    371 
    372      if (
    373        content["@context"] != "http://schema.org" &&
    374        content["@context"] != "https://schema.org"
    375      ) {
    376        continue;
    377      }
    378 
    379      let item = fromLD(content);
    380      if (item instanceof Item) {
    381        items.push(item);
    382      }
    383    } catch (e) {
    384      // Unparsable content.
    385    }
    386  }
    387 
    388  return items;
    389 }
    390 
    391 /**
    392 * Collects schema.org related data from a page.
    393 *
    394 * Currently only supports HTML Microdata and JSON-LD formats, not RDFa.
    395 */
    396 export const SchemaOrgPageData = {
    397  /**
    398   * Parses and collects the schema.org items from the given document.
    399   * The returned items are the roots, i.e. the top-level items, there may be
    400   * other items as nested properties.
    401   *
    402   * @param {Document} document
    403   *   The DOM document to parse.
    404   * @returns {Item[]}
    405   */
    406  collectItems(document) {
    407    return collectMicrodataItems(document).concat(collectJsonLDItems(document));
    408  },
    409 
    410  /**
    411   * Performs PageData collection from the given document.
    412   *
    413   * @param {Document} document
    414   *   The DOM document to collect from.
    415   * @returns {PageData}
    416   */
    417  collect(document) {
    418    let pageData = { data: {} };
    419 
    420    let items = this.collectItems(document);
    421 
    422    for (let item of items) {
    423      switch (item.type) {
    424        case "Product":
    425          if (!(PageDataSchema.DATA_TYPE.PRODUCT in pageData.data)) {
    426            collectProduct(document, pageData, item);
    427          }
    428          break;
    429        case "Organization":
    430          pageData.siteName = item.get("name");
    431          break;
    432      }
    433    }
    434 
    435    return pageData;
    436  },
    437 };