SearchBrowsingHistoryDomainBoost.sys.mjs (8859B)
/**
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

/**
 * SearchBrowsingHistoryDomainBoost
 *
 * Temporary heuristic for general-category queries (games, movies, news, etc.)
 * when semantic embeddings over title/description are insufficient.
 *
 * Safe to remove once richer embeddings or better intent classification lands.
 */

/**
 * Category table used by `matchDomains`.
 *
 * Each category has an `id`, a list of trigger `terms` (phrases matched
 * against the normalized query) and the `domains` returned when one of the
 * terms matches. Categories are checked in array order and the first match
 * wins, so the more specific news categories (`tech_news`, `finance_news`)
 * are listed before the general `news` category.
 */
export const CATEGORIES_JSON = {
  language: "en",
  categories: [
    {
      id: "games",
      terms: [
        "game",
        "games",
        "video game",
        "video games",
        "pc games",
        "console games",
      ],
      domains: [
        "store.steampowered.com",
        "roblox.com",
        "ign.com",
        "gamespot.com",
        "polygon.com",
        "metacritic.com",
        "epicgames.com",
        "store.playstation.com",
        "xbox.com",
        "nintendo.com",
      ],
    },
    {
      id: "movies",
      terms: ["movie", "movies", "film", "films", "cinema"],
      domains: [
        "imdb.com",
        "rottentomatoes.com",
        "metacritic.com",
        "letterboxd.com",
        "netflix.com",
        "primevideo.com",
        "disneyplus.com",
        "hulu.com",
        "max.com",
      ],
    },
    {
      id: "tv",
      terms: ["tv show", "tv shows", "show", "shows", "series", "tv series"],
      domains: [
        "imdb.com",
        "rottentomatoes.com",
        "metacritic.com",
        "tvmaze.com",
        "thetvdb.com",
        "netflix.com",
        "primevideo.com",
        "disneyplus.com",
        "hulu.com",
        "max.com",
      ],
    },
    {
      id: "books",
      terms: ["book", "books", "novel", "novels"],
      domains: [
        "goodreads.com",
        "gutenberg.org",
        "openlibrary.org",
        "barnesandnoble.com",
        "indigo.ca",
      ],
    },
    {
      id: "anime",
      terms: ["anime", "manga"],
      domains: [
        "myanimelist.net",
        "anilist.co",
        "kitsu.app",
        "crunchyroll.com",
      ],
    },
    {
      id: "music",
      terms: ["music", "song", "songs", "album", "albums", "lyrics"],
      domains: [
        "spotify.com",
        "music.apple.com",
        "soundcloud.com",
        "bandcamp.com",
        "music.youtube.com",
      ],
    },
    {
      id: "podcasts",
      terms: ["podcast", "podcasts"],
      domains: [
        "podcasts.apple.com",
        "overcast.fm",
        "pocketcasts.com",
        "castbox.fm",
      ],
    },
    {
      id: "papers_research",
      terms: [
        "paper",
        "papers",
        "research paper",
        "research papers",
        "academic paper",
        "academic papers",
        "journal",
        "journals",
        "study",
        "studies",
        "publication",
        "publications",
      ],
      domains: [
        "scholar.google.com",
        "arxiv.org",
        "semanticscholar.org",
        "pubmed.ncbi.nlm.nih.gov",
        "researchgate.net",
        "ieeexplore.ieee.org",
        "dl.acm.org",
        "springer.com",
        "nature.com",
        "science.org",
      ],
    },
    {
      id: "tech_news",
      terms: ["tech news", "technology news", "startup news"],
      domains: [
        "theverge.com",
        "techcrunch.com",
        "wired.com",
        "arstechnica.com",
        "engadget.com",
      ],
    },
    {
      id: "finance_news",
      terms: ["finance news", "business news", "market news", "stock news"],
      domains: [
        "bloomberg.com",
        "wsj.com",
        "ft.com",
        "reuters.com",
        "cnbc.com",
      ],
    },
    {
      id: "news",
      terms: [
        "news",
        "headline",
        "headlines",
        "breaking news",
        "world news",
        "latest news",
      ],
      domains: [
        "reuters.com",
        "apnews.com",
        "bbc.com",
        "cnn.com",
        "nytimes.com",
        "theguardian.com",
        "washingtonpost.com",
        "aljazeera.com",
        "npr.org",
        "wsj.com",
        "bloomberg.com",
        "ft.com",
      ],
    },
    {
      id: "recipes",
      terms: [
        "recipe",
        "recipes",
        "cooking",
        "food",
        "dinner ideas",
        "meal prep",
      ],
      domains: [
        "allrecipes.com",
        "seriouseats.com",
        "foodnetwork.com",
        "bbcgoodfood.com",
        "epicurious.com",
        "nytcooking.com",
      ],
    },
    {
      id: "travel",
      terms: ["travel", "hotels", "places", "destinations", "things to do"],
      domains: [
        "tripadvisor.com",
        "booking.com",
        "expedia.com",
        "airbnb.com",
        "lonelyplanet.com",
      ],
    },
  ],
};

/**
 * Normalizes a query string into a lowercase, space-separated form suitable for matching
 * and comparison.
 *
 * Every run of characters that is not a Unicode letter or number becomes a
 * single space, and leading/trailing spaces are removed. Non-string input
 * (including null/undefined) normalizes to the empty string instead of
 * throwing.
 *
 * @param {string} s
 * @returns {string}
 */
function normalizeQuery(s) {
  if (typeof s !== "string") {
    return "";
  }
  return s
    .toLowerCase()
    .replace(/[^\p{L}\p{N}]+/gu, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Returns the matched category domains if searchTerm looks like a general category query.
 * Uses phrase matching on normalized query string.
 *
 * Categories are scanned in array order and the domains of the first category
 * with a matching term are returned. The returned array is the category's own
 * `domains` array (not a copy).
 *
 * @param {string} searchTerm
 * @param {object} [categoriesJson=CATEGORIES_JSON]
 * @returns {string[]|null}
 *   The matching category's domains, or null when the query is empty, nothing
 *   matches, or `categoriesJson` has no `categories` array.
 */
export function matchDomains(searchTerm, categoriesJson = CATEGORIES_JSON) {
  const normalized = normalizeQuery(searchTerm);
  if (!normalized) {
    return null;
  }

  const categories = categoriesJson?.categories;
  if (!Array.isArray(categories)) {
    return null;
  }

  // Pad with spaces to enable whole-token phrase matching via includes.
  const paddedQuery = ` ${normalized} `;

  for (const category of categories) {
    if (!Array.isArray(category?.terms)) {
      continue;
    }
    for (const term of category.terms) {
      const normalizedTerm = normalizeQuery(term);
      if (normalizedTerm && paddedQuery.includes(` ${normalizedTerm} `)) {
        return category.domains;
      }
    }
  }

  return null;
}

/**
 * Escapes the LIKE metacharacters (`%`, `_`) and the escape character itself
 * (`\`) so the text is matched literally by `LIKE ... ESCAPE '\'`.
 *
 * @param {string} text
 * @returns {string}
 */
function escapeForLike(text) {
  return text.replace(/[\\%_]/g, "\\$&");
}

/**
 * Builds a SQL WHERE clause for matching `http`/`https` URLs belonging
 * to the given root domains and their `www` variants.
 *
 * Each usable domain contributes two bound parameters (`:dN`), one for the
 * bare host and one for the `www.` host. Entries that are not strings, or
 * that are blank after trimming, are skipped. When no usable domain remains
 * the clause is the constant `0`, so the enclosing query matches nothing.
 *
 * @param {string[]} domains
 * @returns {{ where: string, params: object }}
 */
function buildDomainUrlWhere(domains) {
  const clauses = [];
  const params = {};
  let i = 0;

  const addPattern = pattern => {
    params[`d${i}`] = pattern;
    clauses.push(`lower(url) LIKE :d${i} ESCAPE '\\'`);
    i++;
  };

  for (const raw of domains || []) {
    if (typeof raw !== "string") {
      continue;
    }
    const domain = raw.trim().toLowerCase();
    if (!domain) {
      continue;
    }

    // Escape so that "_" or "%" inside a domain cannot act as a wildcard.
    const escaped = escapeForLike(domain);

    // - https://domain/...
    // - https://www.domain/...
    addPattern(`%://${escaped}/%`);
    addPattern(`%://www.${escaped}/%`);
  }

  return {
    where: clauses.length ? `(${clauses.join(" OR ")})` : "0",
    params,
  };
}

/**
 * Domain-filtered moz_places query (time-windowed).
 *
 * Selects rows with non-zero frecency whose URL matches one of the domains,
 * optionally restricted to `last_visit_date` within [startTs, endTs] (a null
 * bound disables that side of the window), ordered by most recent visit and
 * then frecency, and capped at `historyLimit`. Each row is passed through
 * `buildHistoryRow`, one at a time, in query order.
 *
 * @param {object} params
 * @param {object} params.conn
 * @param {string[]} params.domains
 * @param {number|null} params.startTs
 * @param {number|null} params.endTs
 * @param {number} params.historyLimit
 * @param {Function} params.buildHistoryRow
 * @returns {Promise<object[]>}
 *   The built rows; an empty array when `conn` is missing or `domains` is not
 *   a non-empty array.
 */
export async function searchByDomains({
  conn,
  domains,
  startTs,
  endTs,
  historyLimit,
  buildHistoryRow,
}) {
  if (!conn || !Array.isArray(domains) || !domains.length) {
    return [];
  }

  const { where, params } = buildDomainUrlWhere(domains);

  const results = await conn.executeCached(
    `
    SELECT id,
           title,
           url,
           NULL AS distance,
           visit_count,
           frecency,
           last_visit_date,
           preview_image_url
    FROM moz_places
    WHERE frecency <> 0
      AND (:startTs IS NULL OR last_visit_date >= :startTs)
      AND (:endTs IS NULL OR last_visit_date <= :endTs)
      AND ${where}
    ORDER BY last_visit_date DESC, frecency DESC
    LIMIT :limit
    `,
    {
      startTs,
      endTs,
      limit: historyLimit,
      ...params,
    }
  );

  const rows = [];
  for (const row of results) {
    rows.push(await buildHistoryRow(row));
  }
  return rows;
}

/**
 * Merge two result lists, keeping `primary` order, then topping up from `secondary`,
 * while de-duping by url (fallback to id).
 *
 * Rows that have neither a url nor an id cannot be de-duplicated and are
 * always kept. A `limit` of zero or less yields an empty list; a missing or
 * non-numeric `limit` means "no cap".
 *
 * @param {object[]} primary
 * @param {object[]} secondary
 * @param {number} limit
 * @returns {object[]}
 */
export function mergeDedupe(primary, secondary, limit) {
  const numericLimit = limit == null ? NaN : Number(limit);
  const cap = Number.isNaN(numericLimit) ? Infinity : numericLimit;
  if (cap <= 0) {
    return [];
  }

  const seen = new Set();
  const out = [];

  for (const list of [primary, secondary]) {
    for (const row of list || []) {
      const key = row?.url || row?.id;
      // Only rows with a usable key take part in de-duplication; otherwise
      // every keyless row would collapse onto the same `undefined` key.
      if (key != null) {
        if (seen.has(key)) {
          continue;
        }
        seen.add(key);
      }
      out.push(row);
      if (out.length >= cap) {
        return out;
      }
    }
  }

  return out;
}

export const SearchBrowsingHistoryDomainBoost = Object.freeze({
  matchDomains,
  searchByDomains,
  mergeDedupe,
});