PageDataSchema.sys.mjs (6769B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  JsonSchemaValidator:
    "resource://gre/modules/components-utils/JsonSchemaValidator.sys.mjs",
  OpenGraphPageData:
    "moz-src:///browser/components/pagedata/OpenGraphPageData.sys.mjs",
  SchemaOrgPageData:
    "moz-src:///browser/components/pagedata/SchemaOrgPageData.sys.mjs",
  TwitterPageData:
    "moz-src:///browser/components/pagedata/TwitterPageData.sys.mjs",
});

// Console instance for this module. It logs at "Debug" level when the
// `browser.pagedata.log` pref is true and at "Warn" level otherwise.
ChromeUtils.defineLazyGetter(lazy, "logConsole", function () {
  return console.createInstance({
    prefix: "PageData",
    maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
      ? "Debug"
      : "Warn",
  });
});

/**
 * The list of page data collectors. These should be sorted in order of
 * specificity: if the same piece of data is provided by two collectors then
 * the earlier one wins.
 *
 * Collectors must provide a `collect` function which will be passed the
 * document object and should return the PageData structure. The function may
 * be asynchronous if needed.
 *
 * The data returned need not be valid; collectors should return whatever they
 * can and anything that is invalid is dropped once all data is joined.
 */
ChromeUtils.defineLazyGetter(lazy, "DATA_COLLECTORS", function () {
  return [lazy.SchemaOrgPageData, lazy.OpenGraphPageData, lazy.TwitterPageData];
});

// Cache of already loaded schemas, keyed by lower-case schema name.
const SCHEMAS = new Map();

/**
 * Loads the schema for the given name. Loaded schemas are cached so each
 * schema file is normally only fetched once.
 *
 * @param {string} schemaName
 *   The name of the schema to load. Case is ignored.
 * @returns {Promise<object>}
 *   Resolves to the loaded schema.
 * @throws {Error}
 *   If the schema file could not be fetched.
 */
async function loadSchema(schemaName) {
  // Use the locale-independent conversion. `toLocaleLowerCase()` follows the
  // default locale, and under e.g. a Turkish locale "ARTICLE" lower-cases to
  // "artıcle" (dotless i), which does not match any schema file.
  let key = schemaName.toLowerCase();

  if (SCHEMAS.has(key)) {
    return SCHEMAS.get(key);
  }

  let url = `chrome://browser/content/pagedata/schemas/${key}.schema.json`;
  let response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to load schema: ${response.statusText}`);
  }

  let schema = await response.json();
  SCHEMAS.set(key, schema);
  return schema;
}

/**
 * Validates the data using the schema with the given name. Resolves with no
 * value when the data is valid and throws the validator's error otherwise.
 *
 * @param {string} schemaName
 *   The name of the schema to validate against. Case is ignored.
 * @param {object} data
 *   The data to validate.
 */
async function validateData(schemaName, data) {
  let schema = await loadSchema(schemaName);

  let result = lazy.JsonSchemaValidator.validate(data, schema, {
    allowExplicitUndefinedProperties: true,
    // Allowed for future expansion of the schema.
    allowAdditionalProperties: true,
  });

  if (!result.valid) {
    throw result.error;
  }
}

/**
 * A shared API that can be used in parent or child processes
 */
export const PageDataSchema = {
  // Enumeration of data types. The keys must match the schema name.
  DATA_TYPE: Object.freeze({
    // Note that 1 and 2 were used as types in earlier versions and should not
    // be used here.
    PRODUCT: 3,
    DOCUMENT: 4,
    ARTICLE: 5,
    AUDIO: 6,
    VIDEO: 7,
  }),

  /**
   * Gets the data type name.
   *
   * @param {DATA_TYPE} type
   *   The data type from the DATA_TYPE enumeration. This may also be the
   *   string form of the number, as seen when the type was used as an object
   *   key.
   *
   * @returns {string | null} The name for the type or null if not found.
   */
  nameForType(type) {
    for (let [name, value] of Object.entries(this.DATA_TYPE)) {
      // Loose equality is deliberate: `validatePageData` passes the keys of
      // the data map, which are strings such as "3".
      if (value == type) {
        return name;
      }
    }

    return null;
  },

  /**
   * Asynchronously validates some page data against the expected schema.
   * Throws an exception if the type is unknown or validation fails.
   *
   * @param {DATA_TYPE} type
   *   The data type from the DATA_TYPE enumeration
   * @param {object} data
   *   The page data
   */
  async validateData(type, data) {
    let name = this.nameForType(type);

    if (!name) {
      throw new Error(`Unknown data type ${type}`);
    }

    await validateData(name, data);
  },

  /**
   * Asynchronously validates an entire PageData structure. Any invalid or
   * unknown data types are dropped. Throws if the general (non type specific)
   * part of the data is invalid.
   *
   * @param {PageData} pageData
   *   The page data
   *
   * @returns {Promise<PageData>} The validated page data structure
   */
  async validatePageData(pageData) {
    // Split out the general data from the map of type specific data.
    let { data: dataMap = {}, ...general } = pageData;

    await validateData("general", general);

    let validData = {};

    for (let [type, data] of Object.entries(dataMap)) {
      let name = this.nameForType(type);
      // Ignore unknown types here.
      if (!name) {
        continue;
      }

      try {
        await validateData(name, data);

        validData[type] = data;
      } catch (e) {
        // Invalid data is dropped, but leave a trace for debugging.
        lazy.logConsole.debug(`Dropping invalid ${name} page data`, e);
      }
    }

    return {
      ...general,
      data: validData,
    };
  },

  /**
   * Adds new page data into an existing data set. Any existing data is not
   * overwritten. Neither argument is modified.
   *
   * @param {PageData} existingPageData
   *   The existing page data
   * @param {PageData} newPageData
   *   The new page data
   *
   * @returns {PageData} The joined data.
   */
  coalescePageData(existingPageData, newPageData) {
    // Split out the general data from the map of specific data.
    let { data: existingMap = {}, ...existingGeneral } = existingPageData;
    let { data: newMap = {}, ...newGeneral } = newPageData;

    // `newGeneral` is a fresh object created by the rest pattern above, so
    // layering the existing values on top of it does not touch the caller's
    // object.
    Object.assign(newGeneral, existingGeneral);

    let dataMap = {};
    for (let [type, data] of Object.entries(existingMap)) {
      if (type in newMap) {
        // Both sides have data of this type: merge property by property,
        // with the existing values taking precedence.
        dataMap[type] = Object.assign({}, newMap[type], data);
      } else {
        dataMap[type] = data;
      }
    }

    for (let [type, data] of Object.entries(newMap)) {
      if (!(type in dataMap)) {
        dataMap[type] = data;
      }
    }

    return {
      ...newGeneral,
      data: dataMap,
    };
  },

  /**
   * Collects page data from a DOM document by running every collector and
   * joining then validating the results.
   *
   * @param {Document} document
   *   The DOM document to collect data from
   *
   * @returns {Promise<PageData | null>} The data collected or null in case of
   *   error.
   */
  async collectPageData(document) {
    lazy.logConsole.debug("Starting collection", document.documentURI);

    let pending = lazy.DATA_COLLECTORS.map(async collector => {
      try {
        return await collector.collect(document);
      } catch (e) {
        lazy.logConsole.error("Error collecting page data", e);
        return null;
      }
    });

    let collected = await Promise.all(pending);

    try {
      let pageData = collected
        // A collector that failed (or found nothing) yields no object to
        // merge; passing null on to coalescePageData would throw.
        .filter(data => data !== null && data !== undefined)
        .reduce(
          (joined, data) => PageDataSchema.coalescePageData(joined, data),
          {
            date: Date.now(),
            url: document.documentURI,
          }
        );

      // The await is required: without it a validation failure would reject
      // the returned promise instead of being caught below.
      return await this.validatePageData(pageData);
    } catch (e) {
      lazy.logConsole.error("Failed to collect valid page data", e);
      return null;
    }
  },
};