SchemaOrgPageData.sys.mjs (10709B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { PageDataSchema } from "moz-src:///browser/components/pagedata/PageDataSchema.sys.mjs";

/**
 * Represents an item from the schema.org specification.
 *
 * Every `Item` has a type and a set of properties. Each property has a string
 * name and a list of values. It often isn't clear from the spec whether a
 * property is expected to have a list of values or just one value so this
 * data structure stores every property as a list and provides a simple method
 * to get the first property value.
 */
class Item {
  /** @type {string} The type of the item e.g. "Product" or "Person". */
  type;

  /** @type {Map<string, any[]>} Properties of the item. */
  properties = new Map();

  /**
   * Constructs a new `Item` of the given type.
   *
   * @param {string} type
   *   The type of the item.
   */
  constructor(type) {
    this.type = type;
  }

  /**
   * Tests whether a property has any values in this item.
   *
   * A property that is present but holds an empty list of values (possible
   * when a JSON-LD document supplies `[]`) is reported as absent, so that
   * `has(prop)` being true implies `get(prop)` yields a stored value.
   *
   * @param {string} prop
   *   The name of the property.
   * @returns {boolean}
   */
  has(prop) {
    return this.all(prop).length > 0;
  }

  /**
   * Gets all of the values for a property. This may return an empty array if
   * there are no values.
   *
   * @param {string} prop
   *   The name of the property.
   * @returns {any[]}
   */
  all(prop) {
    return this.properties.get(prop) ?? [];
  }

  /**
   * Gets the first value for a property, or `undefined` if the property has
   * no values.
   *
   * @param {string} prop
   *   The name of the property.
   * @returns {any}
   */
  get(prop) {
    return this.properties.get(prop)?.[0];
  }

  /**
   * Adds a value for a property. Existing values for the property are kept;
   * the new value is appended to the end of the list.
   *
   * @param {string} prop
   *   The name of the property.
   * @param {any} value
   *   The value of the property.
   */
  set(prop, value) {
    let values = this.properties.get(prop);
    if (values === undefined) {
      values = [];
      this.properties.set(prop, values);
    }

    values.push(value);
  }

  /**
   * Converts this item to JSON-LD.
   *
   * Properties holding exactly one value are emitted as that bare value, all
   * other properties are emitted as arrays. Nested `Item` values are
   * converted recursively.
   *
   * @returns {object}
   */
  toJsonLD() {
    /**
     * Converts a value to its JSON-LD representation.
     *
     * @param {any} val
     *   The value to convert.
     * @returns {any}
     */
    function toLD(val) {
      return val instanceof Item ? val.toJsonLD() : val;
    }

    let entries = Array.from(this.properties, ([key, values]) => [
      key,
      values.length == 1 ? toLD(values[0]) : values.map(toLD),
    ]);

    return {
      "@type": this.type,
      ...Object.fromEntries(entries),
    };
  }
}

/**
 * Parses the value for a given microdata property.
 * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
 *
 * @param {Element} propElement
 *   The property element.
 * @returns {any}
 *   The value of the property.
 * @throws {Error}
 *   If the element has an `itemscope` attribute; such elements represent
 *   nested items rather than simple values.
 */
function parseMicrodataProp(propElement) {
  if (propElement.hasAttribute("itemscope")) {
    throw new Error(
      "Cannot parse a simple property value from an itemscope element."
    );
  }

  // Resolves the URL held in `attr` against the document URI. Yields the
  // empty string when the attribute is missing or cannot be parsed.
  const parseUrl = (urlElement, attr) => {
    if (!urlElement.hasAttribute(attr)) {
      return "";
    }

    let url = URL.parse(
      urlElement.getAttribute(attr),
      urlElement.ownerDocument.documentURI
    );
    return url ? url.toString() : "";
  };

  switch (propElement.localName) {
    case "meta":
      return propElement.getAttribute("content") ?? "";
    case "audio":
    case "embed":
    case "iframe":
    case "source":
    case "track":
    case "video":
      return parseUrl(propElement, "src");
    case "img":
      // Some pages may be using a lazy loading approach to images, putting a
      // temporary image in "src" while the real image is in a differently
      // named attribute. So far we found "content" and "data-src" are common
      // names for that attribute.
      return (
        parseUrl(propElement, "content") ||
        parseUrl(propElement, "data-src") ||
        parseUrl(propElement, "src")
      );
    case "object":
      return parseUrl(propElement, "data");
    case "a":
    case "area":
    case "link":
      return parseUrl(propElement, "href");
    case "data":
    case "meter":
      // A missing "value" attribute yields the empty string, like "meta".
      return propElement.getAttribute("value") ?? "";
    case "time":
      if (propElement.hasAttribute("datetime")) {
        return propElement.getAttribute("datetime");
      }
      return propElement.textContent;
    default:
      // Not mentioned in the spec but sites seem to use it.
      if (propElement.hasAttribute("content")) {
        return propElement.getAttribute("content");
      }
      return propElement.textContent;
  }
}

/**
 * Collects product data from an item.
 *
 * Sets `pageData.image` and `pageData.description` when the item provides
 * them and always writes a product entry (with at least a `name` key) into
 * `pageData.data`. The price is taken from the first "Offer" item whose
 * price parses as a number.
 *
 * @param {Document} document
 *   The document the item comes from.
 * @param {PageData} pageData
 *   The pageData object to add to.
 * @param {Item} item
 *   The product item.
 */
function collectProduct(document, pageData, item) {
  let image = item.get("image");
  // Only non-empty strings can be resolved to a URL. Nested items or an empty
  // string (what a URL-valued microdata element without its attribute yields)
  // would otherwise resolve to a meaningless address.
  if (typeof image == "string" && image) {
    // URL.parse returns null instead of throwing, so one malformed image URL
    // cannot abort the rest of the collection.
    let url = URL.parse(image, document.documentURI);
    if (url) {
      pageData.image = url.toString();
    }
  }

  if (item.has("description")) {
    pageData.description = item.get("description");
  }

  let product = {
    name: item.get("name"),
  };
  pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = product;

  for (let offer of item.all("offers")) {
    if (!(offer instanceof Item) || offer.type != "Offer") {
      continue;
    }

    let price = parseFloat(offer.get("price"));
    if (Number.isNaN(price)) {
      continue;
    }

    product.price = {
      value: price,
      currency: offer.get("priceCurrency"),
    };

    // Only the first offer with a usable price is recorded.
    break;
  }
}

/**
 * Returns the root microdata items from the given document.
 *
 * Only elements with an `itemscope` attribute and an `itemtype` starting with
 * the http or https schema.org base URL are treated as items.
 *
 * @param {Document} document
 *   The DOM document to collect from.
 * @returns {Item[]}
 */
function collectMicrodataItems(document) {
  const HTTPS_BASE = "https://schema.org/";
  const HTTP_BASE = "http://schema.org/";

  // First find all of the items in the document.
  let itemElements = document.querySelectorAll(
    "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']"
  );

  /**
   * Maps elements to the closest item.
   *
   * @type {Map<Element, Item>}
   */
  let items = new Map();

  /**
   * Finds the item for an element. Throws if there is no item. Caches the
   * result.
   *
   * @param {Element} element
   *   The element to search from.
   * @returns {Item}
   */
  function itemFor(element) {
    let found = items.get(element);
    if (found) {
      return found;
    }

    if (!element.parentElement) {
      throw new Error("Element has no parent item.");
    }

    // Walk up the tree, remembering the answer for every element visited.
    found = itemFor(element.parentElement);
    items.set(element, found);
    return found;
  }

  for (let element of itemElements) {
    let itemType = element.getAttribute("itemtype");
    // Strip off the base url. The selector above guarantees the type starts
    // with one of the two bases.
    let baseLength = itemType.startsWith("https://")
      ? HTTPS_BASE.length
      : HTTP_BASE.length;

    items.set(element, new Item(itemType.substring(baseLength)));
  }

  // The initial roots are just all the items.
  let roots = new Set(items.values());

  // Now find all item properties.
  let itemProps = document.querySelectorAll(
    "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]"
  );

  for (let element of itemProps) {
    // The item is always defined above the current element.
    let owner = itemFor(element.parentElement);

    // The property's value is either a nested item or a simple value.
    let propValue = items.get(element) ?? parseMicrodataProp(element);
    owner.set(element.getAttribute("itemprop"), propValue);

    if (propValue instanceof Item) {
      // This item belongs to another item and so is not a root item.
      roots.delete(propValue);
    }
  }

  return [...roots];
}

/**
 * Returns the root JSON-LD items from the given document.
 *
 * Each `application/ld+json` script is parsed independently. Scripts that
 * fail to parse, or whose top-level value is not an object with a schema.org
 * `@context` and an `@type`, are skipped.
 *
 * @param {Document} document
 *   The DOM document to collect from.
 * @returns {Item[]}
 */
function collectJsonLDItems(document) {
  /**
   * The root items.
   *
   * @type {Item[]}
   */
  let items = [];

  /**
   * Converts a JSON-LD value into an Item if appropriate.
   *
   * @param {any} val
   *   The value to convert.
   * @returns {any}
   *   An `Item` when `val` is an object carrying "@type", otherwise `val`
   *   unchanged.
   */
  function fromLD(val) {
    // `typeof null` is "object", and `in` throws on null, so rule it out
    // explicitly. JSON null values are passed through untouched.
    if (typeof val != "object" || val === null || !("@type" in val)) {
      return val;
    }

    let item = new Item(val["@type"]);

    for (let [prop, value] of Object.entries(val)) {
      // Ignore meta properties.
      if (prop.startsWith("@")) {
        continue;
      }

      let values = Array.isArray(value) ? value : [value];
      item.properties.set(prop, values.map(fromLD));
    }

    return item;
  }

  let scripts = document.querySelectorAll("script[type='application/ld+json']");
  for (let script of scripts) {
    try {
      let content = JSON.parse(script.textContent);

      if (typeof content != "object" || content === null) {
        continue;
      }

      if (!("@context" in content)) {
        continue;
      }

      // NOTE: the loose comparison also admits values that stringify to the
      // URL, such as a one-element array holding it.
      if (
        content["@context"] != "http://schema.org" &&
        content["@context"] != "https://schema.org"
      ) {
        continue;
      }

      let item = fromLD(content);
      if (item instanceof Item) {
        items.push(item);
      }
    } catch (e) {
      // Unparsable content. Collection is best-effort so a bad script is
      // skipped rather than failing the whole document.
    }
  }

  return items;
}

/**
 * Collects schema.org related data from a page.
 *
 * Currently only supports HTML Microdata and JSON-LD formats, not RDFa.
 */
export const SchemaOrgPageData = {
  /**
   * Parses and collects the schema.org items from the given document.
   * The returned items are the roots, i.e. the top-level items, there may be
   * other items as nested properties. Microdata items come first, followed
   * by JSON-LD items.
   *
   * @param {Document} document
   *   The DOM document to parse.
   * @returns {Item[]}
   */
  collectItems(document) {
    return [...collectMicrodataItems(document), ...collectJsonLDItems(document)];
  },

  /**
   * Performs PageData collection from the given document.
   *
   * Only the first "Product" root item contributes product data. Every
   * "Organization" root item overwrites `siteName`, so the last one wins.
   *
   * @param {Document} document
   *   The DOM document to collect from.
   * @returns {PageData}
   */
  collect(document) {
    let pageData = { data: {} };

    for (let item of this.collectItems(document)) {
      switch (item.type) {
        case "Product":
          if (!(PageDataSchema.DATA_TYPE.PRODUCT in pageData.data)) {
            collectProduct(document, pageData, item);
          }
          break;
        case "Organization":
          pageData.siteName = item.get("name");
          break;
      }
    }

    return pageData;
  },
};