telemetry.sys.mjs (42859B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 * You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 // Support for Sync-and-FxA-related telemetry, which is submitted in a special-purpose 6 // telemetry ping called the "sync ping", documented here: 7 // 8 // ../../../toolkit/components/telemetry/docs/data/sync-ping.rst 9 // 10 // The sync ping contains identifiers that are linked to the user's Firefox Account 11 // and are separate from the main telemetry client_id, so this file is also responsible 12 // for ensuring that we can delete those pings upon user request, by plumbing its 13 // identifiers into the "deletion-request" ping. 14 15 import { Log } from "resource://gre/modules/Log.sys.mjs"; 16 17 const lazy = {}; 18 19 ChromeUtils.defineESModuleGetters(lazy, { 20 Async: "resource://services-common/async.sys.mjs", 21 AuthenticationError: "resource://services-sync/sync_auth.sys.mjs", 22 FxAccounts: "resource://gre/modules/FxAccounts.sys.mjs", 23 ObjectUtils: "resource://gre/modules/ObjectUtils.sys.mjs", 24 Observers: "resource://services-common/observers.sys.mjs", 25 Resource: "resource://services-sync/resource.sys.mjs", 26 Status: "resource://services-sync/status.sys.mjs", 27 Svc: "resource://services-sync/util.sys.mjs", 28 TelemetryController: "resource://gre/modules/TelemetryController.sys.mjs", 29 TelemetryEnvironment: "resource://gre/modules/TelemetryEnvironment.sys.mjs", 30 TelemetryUtils: "resource://gre/modules/TelemetryUtils.sys.mjs", 31 Weave: "resource://services-sync/main.sys.mjs", 32 }); 33 34 ChromeUtils.defineLazyGetter(lazy, "fxAccounts", () => { 35 return ChromeUtils.importESModule( 36 "resource://gre/modules/FxAccounts.sys.mjs" 37 ).getFxAccountsSingleton(); 38 }); 39 40 import * as constants from "resource://services-sync/constants.sys.mjs"; 41 42 ChromeUtils.defineLazyGetter( 43 lazy, 44 "WeaveService", 45 () => Cc["@mozilla.org/weave/service;1"].getService().wrappedJSObject 46 ); 47 const log = Log.repository.getLogger("Sync.Telemetry"); 48 49 const TOPICS = [ 50 // For tracking change to account/device identifiers. 51 "fxaccounts:new_device_id", 52 "fxaccounts:onlogout", 53 "weave:service:ready", 54 "weave:service:login:got-hashed-id", 55 56 // For whole-of-sync metrics. 57 "weave:service:sync:start", 58 "weave:service:sync:finish", 59 "weave:service:sync:error", 60 61 // For individual engine metrics. 62 "weave:engine:sync:start", 63 "weave:engine:sync:finish", 64 "weave:engine:sync:error", 65 "weave:engine:sync:applied", 66 "weave:engine:sync:step", 67 "weave:engine:sync:uploaded", 68 "weave:engine:validate:finish", 69 "weave:engine:validate:error", 70 71 // For ad-hoc telemetry events. 72 "weave:telemetry:event", 73 "weave:telemetry:histogram", 74 "fxa:telemetry:event", 75 76 "weave:telemetry:migration", 77 ]; 78 79 const PING_FORMAT_VERSION = 1; 80 81 const EMPTY_UID = "0".repeat(32); 82 83 // The set of engines we record telemetry for - any other engines are ignored. 84 const ENGINES = new Set([ 85 "addons", 86 "bookmarks", 87 "clients", 88 "forms", 89 "history", 90 "passwords", 91 "prefs", 92 "tabs", 93 "extension-storage", 94 "addresses", 95 "creditcards", 96 ]); 97 98 function tryGetMonotonicTimestamp() { 99 try { 100 return Services.telemetry.msSinceProcessStart(); 101 } catch (e) { 102 log.warn("Unable to get a monotonic timestamp!"); 103 return -1; 104 } 105 } 106 107 function timeDeltaFrom(monotonicStartTime) { 108 let now = tryGetMonotonicTimestamp(); 109 if (monotonicStartTime !== -1 && now !== -1) { 110 return Math.round(now - monotonicStartTime); 111 } 112 return -1; 113 } 114 115 const NS_ERROR_MODULE_BASE_OFFSET = 0x45; 116 const NS_ERROR_MODULE_NETWORK = 6; 117 118 // A reimplementation of NS_ERROR_GET_MODULE, which surprisingly doesn't seem 119 // to exist anywhere in .js code in a way that can be reused. 120 // This is taken from DownloadCore.sys.mjs. 121 function NS_ERROR_GET_MODULE(code) { 122 return ((code & 0x7fff0000) >> 16) - NS_ERROR_MODULE_BASE_OFFSET; 123 } 124 125 // Converts extra integer fields to strings, rounds floats to three 126 // decimal places (nanosecond precision for timings), and removes profile 127 // directory paths and URLs from potential error messages. 128 function normalizeExtraTelemetryFields(extra) { 129 let result = {}; 130 for (let key in extra) { 131 let value = extra[key]; 132 let type = typeof value; 133 if (type == "string") { 134 result[key] = ErrorSanitizer.cleanErrorMessage(value); 135 } else if (type == "number") { 136 result[key] = Number.isInteger(value) 137 ? value.toString(10) 138 : value.toFixed(3); 139 } else if (type != "undefined") { 140 throw new TypeError( 141 `Invalid type ${type} for extra telemetry field ${key}` 142 ); 143 } 144 } 145 return lazy.ObjectUtils.isEmpty(result) ? undefined : result; 146 } 147 148 // Keps track of the counts of individual records fate during a sync cycle 149 // The main reason this is a class is to keep track of reasons individual records 150 // failure reasons without huge memory overhead. 151 export class SyncedRecordsTelemetry { 152 // applied => number of items that should be applied. 153 // failed => number of items that failed in this sync. 154 // newFailed => number of items that failed for the first time in this sync. 155 // reconciled => number of items that were reconciled. 156 // failedReasons => {name, count} of reasons a record failed 157 incomingCounts = { 158 applied: 0, 159 failed: 0, 160 newFailed: 0, 161 reconciled: 0, 162 failedReasons: null, 163 }; 164 outgoingCounts = { failed: 0, sent: 0, failedReasons: null }; 165 166 addIncomingFailedReason(reason) { 167 if (!this.incomingCounts.failedReasons) { 168 this.incomingCounts.failedReasons = []; 169 } 170 let transformedReason = SyncTelemetry.transformError(reason); 171 // Some errors like http/nss errors don't have an error object 172 // those will be caught by the higher level telemetry 173 if (!transformedReason.error) { 174 return; 175 } 176 177 let index = this.incomingCounts.failedReasons.findIndex( 178 reasons => reasons.name === transformedReason.error 179 ); 180 181 if (index >= 0) { 182 this.incomingCounts.failedReasons[index].count += 1; 183 } else { 184 this.incomingCounts.failedReasons.push({ 185 name: transformedReason.error, 186 count: 1, 187 }); 188 } 189 } 190 191 addOutgoingFailedReason(reason) { 192 if (!this.outgoingCounts.failedReasons) { 193 this.outgoingCounts.failedReasons = []; 194 } 195 let transformedReason = SyncTelemetry.transformError(reason); 196 // Some errors like http/nss errors don't have an error object 197 // those will be caught by the higher level telemetry 198 if (!transformedReason.error) { 199 return; 200 } 201 let index = this.outgoingCounts.failedReasons.findIndex( 202 reasons => reasons.name === transformedReason.error 203 ); 204 if (index >= 0) { 205 this.outgoingCounts.failedReasons[index].count += 1; 206 } else { 207 this.outgoingCounts.failedReasons.push({ 208 name: transformedReason.error, 209 count: 1, 210 }); 211 } 212 } 213 } 214 215 // The `ErrorSanitizer` has 2 main jobs: 216 // * Remove PII from errors, such as the profile dir or URLs the user might 217 // have visited. 218 // * Normalize errors so different locales or operating systems etc don't 219 // generate different messages for the same underlying error. 220 // * [TODO] Normalize errors so environmental factors don't influence message. 221 // For example, timestamps or GUIDs should be replaced with something static. 222 export class ErrorSanitizer { 223 // Things we normalize - this isn't exhaustive, but covers the common error messages we see. 224 // Eg: 225 // > Win error 112 during operation write on file [profileDir]\weave\addonsreconciler.json (Espacio en disco insuficiente. ) 226 // > Win error 112 during operation write on file [profileDir]\weave\addonsreconciler.json (Diskte yeterli yer yok. ) 227 // > <snip many other translations of the error> 228 // > Unix error 28 during operation write on file [profileDir]/weave/addonsreconciler.json (No space left on device) 229 // These tend to crowd out other errors we might care about (eg, top 5 errors for some engines are 230 // variations of the "no space left on device") 231 232 // Note that only errors that have same-but-different errors on Windows and Unix are here - we 233 // still sanitize ones that aren't in these maps to remove the translations etc - eg, 234 // `ERROR_SHARING_VIOLATION` doesn't really have a unix equivalent, so doesn't appear here, but 235 // we still strip the translations to avoid the variants. 236 static E_PERMISSION_DENIED = "OS error [Permission denied]"; 237 static E_NO_FILE_OR_DIR = "OS error [File/Path not found]"; 238 239 static DOMErrorSubstitutions = { 240 NotFoundError: this.E_NO_FILE_OR_DIR, 241 NotAllowedError: this.E_PERMISSION_DENIED, 242 }; 243 244 // IOUtils error messages include the specific nsresult error code that caused them. 245 static NS_ERROR_RE = new RegExp(/ \(NS_ERROR_.*\)$/); 246 247 static #cleanOSErrorMessage(message, error = undefined) { 248 if (DOMException.isInstance(error)) { 249 const sub = this.DOMErrorSubstitutions[error.name]; 250 message = message.replaceAll("\\", "/"); 251 message = message.replace(this.NS_ERROR_RE, ""); 252 if (sub) { 253 return `${sub} ${message}`; 254 } 255 } 256 257 return message; 258 } 259 260 // A regex we can use to replace the profile dir in error messages. We use a 261 // regexp so we can simply replace all case-insensitive occurences. 262 // This escaping function is from: 263 // https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions 264 static reProfileDir = new RegExp( 265 PathUtils.profileDir.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), 266 "gi" 267 ); 268 269 /** 270 * Clean an error message, removing PII and normalizing OS-specific messages. 271 * 272 * @param {string} message The error message 273 * @param {Error?} error The error class instance, if any. 274 */ 275 static cleanErrorMessage(message, error = undefined) { 276 // There's a chance the profiledir is in the error string which is PII we 277 // want to avoid including in the ping. 278 message = message.replace(this.reProfileDir, "[profileDir]"); 279 // MSG_INVALID_URL from /dom/bindings/Errors.msg -- no way to access this 280 // directly from JS. 281 if (message.endsWith("is not a valid URL.")) { 282 message = "<URL> is not a valid URL."; 283 } 284 // Try to filter things that look somewhat like a URL (in that they contain a 285 // colon in the middle of non-whitespace), in case anything else is including 286 // these in error messages. Note that JSON.stringified stuff comes through 287 // here, so we explicitly ignore double-quotes as well. 288 message = message.replace(/[^\s"]+:[^\s"]+/g, "<URL>"); 289 290 // Anywhere that's normalized the guid in errors we can easily filter 291 // to make it easier to aggregate these types of errors 292 message = message.replace(/<guid: ([^>]+)>/g, "<GUID>"); 293 294 return this.#cleanOSErrorMessage(message, error); 295 } 296 } 297 298 // This function validates the payload of a telemetry "event" - this can be 299 // removed once there are APIs available for the telemetry modules to collect 300 // these events (bug 1329530) - but for now we simulate that planned API as 301 // best we can. 302 function validateTelemetryEvent(eventDetails) { 303 let { object, method, value, extra } = eventDetails; 304 // Do do basic validation of the params - everything except "extra" must 305 // be a string. method and object are required. 306 if ( 307 typeof method != "string" || 308 typeof object != "string" || 309 (value && typeof value != "string") || 310 (extra && typeof extra != "object") 311 ) { 312 log.warn("Invalid event parameters - wrong types", eventDetails); 313 return false; 314 } 315 // length checks. 316 if ( 317 method.length > 20 || 318 object.length > 20 || 319 (value && value.length > 80) 320 ) { 321 log.warn("Invalid event parameters - wrong lengths", eventDetails); 322 return false; 323 } 324 325 // extra can be falsey, or an object with string names and values. 326 if (extra) { 327 if (Object.keys(extra).length > 10) { 328 log.warn("Invalid event parameters - too many extra keys", eventDetails); 329 return false; 330 } 331 for (let [ename, evalue] of Object.entries(extra)) { 332 if ( 333 typeof ename != "string" || 334 ename.length > 15 || 335 typeof evalue != "string" || 336 evalue.length > 85 337 ) { 338 log.warn( 339 `Invalid event parameters: extra item "${ename} is invalid`, 340 eventDetails 341 ); 342 return false; 343 } 344 } 345 } 346 return true; 347 } 348 349 class EngineRecord { 350 constructor(name) { 351 // startTime is in ms from process start, but is monotonic (unlike Date.now()) 352 // so we need to keep both it and when. 353 this.startTime = tryGetMonotonicTimestamp(); 354 this.name = name; 355 356 // This allows cases like bookmarks-buffered to have a separate name from 357 // the bookmarks engine. 358 let engineImpl = lazy.Weave.Service.engineManager.get(name); 359 if (engineImpl && engineImpl.overrideTelemetryName) { 360 this.overrideTelemetryName = engineImpl.overrideTelemetryName; 361 } 362 } 363 364 toJSON() { 365 let result = { name: this.overrideTelemetryName || this.name }; 366 let properties = [ 367 "took", 368 "status", 369 "failureReason", 370 "incoming", 371 "outgoing", 372 "validation", 373 "steps", 374 ]; 375 for (let property of properties) { 376 result[property] = this[property]; 377 } 378 return result; 379 } 380 381 finished(error) { 382 let took = timeDeltaFrom(this.startTime); 383 if (took >= 0) { 384 this.took = took; 385 } 386 if (error) { 387 this.failureReason = SyncTelemetry.transformError(error); 388 } 389 } 390 391 recordApplied(counts) { 392 if (this.incoming) { 393 log.error( 394 `Incoming records applied multiple times for engine ${this.name}!` 395 ); 396 return; 397 } 398 if (this.name === "clients" && !counts.failed) { 399 // ignore successful application of client records 400 // since otherwise they show up every time and are meaningless. 401 return; 402 } 403 404 let incomingData = {}; 405 406 if (counts.failedReasons) { 407 // sort the failed reasons in desc by count, then take top 10 408 counts.failedReasons = counts.failedReasons 409 .sort((a, b) => b.count - a.count) 410 .slice(0, 10); 411 } 412 // Counts has extra stuff used for logging, but we only care about a few 413 let properties = ["applied", "failed", "failedReasons"]; 414 // Only record non-zero properties and only record incoming at all if 415 // there's at least one property we care about. 416 for (let property of properties) { 417 if (counts[property]) { 418 incomingData[property] = counts[property]; 419 this.incoming = incomingData; 420 } 421 } 422 } 423 424 recordStep(stepResult) { 425 let step = { 426 name: stepResult.name, 427 }; 428 if (stepResult.took >= 0) { 429 step.took = Math.round(stepResult.took); 430 } 431 if (stepResult.counts) { 432 let counts = stepResult.counts.filter(({ count }) => count > 0); 433 if (counts.length) { 434 step.counts = counts; 435 } 436 } 437 if (this.steps) { 438 this.steps.push(step); 439 } else { 440 this.steps = [step]; 441 } 442 } 443 444 recordValidation(validationResult) { 445 if (this.validation) { 446 log.error(`Multiple validations occurred for engine ${this.name}!`); 447 return; 448 } 449 let { problems, version, took, checked } = validationResult; 450 let validation = { 451 version: version || 0, 452 checked: checked || 0, 453 }; 454 if (took >= 0) { 455 validation.took = Math.round(took); 456 } 457 let summarized = problems.filter(({ count }) => count > 0); 458 if (summarized.length) { 459 validation.problems = summarized; 460 } 461 this.validation = validation; 462 } 463 464 recordValidationError(e) { 465 if (this.validation) { 466 log.error(`Multiple validations occurred for engine ${this.name}!`); 467 return; 468 } 469 470 this.validation = { 471 failureReason: SyncTelemetry.transformError(e), 472 }; 473 } 474 475 recordUploaded(counts) { 476 if (counts.sent || counts.failed) { 477 if (!this.outgoing) { 478 this.outgoing = []; 479 } 480 if (counts.failedReasons) { 481 // sort the failed reasons in desc by count, then take top 10 482 counts.failedReasons = counts.failedReasons 483 .sort((a, b) => b.count - a.count) 484 .slice(0, 10); 485 } 486 this.outgoing.push({ 487 sent: counts.sent || undefined, 488 failed: counts.failed || undefined, 489 failedReasons: counts.failedReasons || undefined, 490 }); 491 } 492 } 493 } 494 495 // The record of a single "sync" - typically many of these are submitted in 496 // a single ping (ie, as a 'syncs' array) 497 export class SyncRecord { 498 constructor(allowedEngines, why) { 499 this.allowedEngines = allowedEngines; 500 // Our failure reason. This property only exists in the generated ping if an 501 // error actually occurred. 502 this.failureReason = undefined; 503 this.syncNodeType = null; 504 this.when = Date.now(); 505 this.startTime = tryGetMonotonicTimestamp(); 506 this.took = 0; // will be set later. 507 this.why = why; 508 509 // All engines that have finished (ie, does not include the "current" one) 510 // We omit this from the ping if it's empty. 511 this.engines = []; 512 // The engine that has started but not yet stopped. 513 this.currentEngine = null; 514 } 515 516 toJSON() { 517 let result = { 518 when: this.when, 519 took: this.took, 520 failureReason: this.failureReason, 521 status: this.status, 522 }; 523 if (this.why) { 524 result.why = this.why; 525 } 526 let engines = []; 527 for (let engine of this.engines) { 528 engines.push(engine.toJSON()); 529 } 530 if (engines.length) { 531 result.engines = engines; 532 } 533 return result; 534 } 535 536 finished(error) { 537 this.took = timeDeltaFrom(this.startTime); 538 if (this.currentEngine != null) { 539 log.error( 540 "Finished called for the sync before the current engine finished" 541 ); 542 this.currentEngine.finished(null); 543 this.onEngineStop(this.currentEngine.name); 544 } 545 if (error) { 546 this.failureReason = SyncTelemetry.transformError(error); 547 } 548 549 this.syncNodeType = lazy.Weave.Service.identity.telemetryNodeType; 550 551 // Check for engine statuses. -- We do this now, and not in engine.finished 552 // to make sure any statuses that get set "late" are recorded 553 for (let engine of this.engines) { 554 let status = lazy.Status.engines[engine.name]; 555 if (status && status !== constants.ENGINE_SUCCEEDED) { 556 engine.status = status; 557 } 558 } 559 560 let statusObject = {}; 561 562 let serviceStatus = lazy.Status.service; 563 if (serviceStatus && serviceStatus !== constants.STATUS_OK) { 564 statusObject.service = serviceStatus; 565 this.status = statusObject; 566 } 567 let syncStatus = lazy.Status.sync; 568 if (syncStatus && syncStatus !== constants.SYNC_SUCCEEDED) { 569 statusObject.sync = syncStatus; 570 this.status = statusObject; 571 } 572 } 573 574 onEngineStart(engineName) { 575 if (this._shouldIgnoreEngine(engineName, false)) { 576 return; 577 } 578 579 if (this.currentEngine) { 580 log.error( 581 `Being told that engine ${engineName} has started, but current engine ${this.currentEngine.name} hasn't stopped` 582 ); 583 // Just discard the current engine rather than making up data for it. 584 } 585 this.currentEngine = new EngineRecord(engineName); 586 } 587 588 onEngineStop(engineName, error) { 589 // We only care if it's the current engine if we have a current engine. 590 if (this._shouldIgnoreEngine(engineName, !!this.currentEngine)) { 591 return; 592 } 593 if (!this.currentEngine) { 594 // It's possible for us to get an error before the start message of an engine 595 // (somehow), in which case we still want to record that error. 596 if (!error) { 597 return; 598 } 599 log.error( 600 `Error triggered on ${engineName} when no current engine exists: ${error}` 601 ); 602 this.currentEngine = new EngineRecord(engineName); 603 } 604 this.currentEngine.finished(error); 605 this.engines.push(this.currentEngine); 606 this.currentEngine = null; 607 } 608 609 onEngineApplied(engineName, counts) { 610 if (this._shouldIgnoreEngine(engineName)) { 611 return; 612 } 613 this.currentEngine.recordApplied(counts); 614 } 615 616 onEngineStep(engineName, step) { 617 if (this._shouldIgnoreEngine(engineName)) { 618 return; 619 } 620 this.currentEngine.recordStep(step); 621 } 622 623 onEngineValidated(engineName, validationData) { 624 if (this._shouldIgnoreEngine(engineName, false)) { 625 return; 626 } 627 let engine = this.engines.find(e => e.name === engineName); 628 if ( 629 !engine && 630 this.currentEngine && 631 engineName === this.currentEngine.name 632 ) { 633 engine = this.currentEngine; 634 } 635 if (engine) { 636 engine.recordValidation(validationData); 637 } else { 638 log.warn( 639 `Validation event triggered for engine ${engineName}, which hasn't been synced!` 640 ); 641 } 642 } 643 644 onEngineValidateError(engineName, error) { 645 if (this._shouldIgnoreEngine(engineName, false)) { 646 return; 647 } 648 let engine = this.engines.find(e => e.name === engineName); 649 if ( 650 !engine && 651 this.currentEngine && 652 engineName === this.currentEngine.name 653 ) { 654 engine = this.currentEngine; 655 } 656 if (engine) { 657 engine.recordValidationError(error); 658 } else { 659 log.warn( 660 `Validation failure event triggered for engine ${engineName}, which hasn't been synced!` 661 ); 662 } 663 } 664 665 onEngineUploaded(engineName, counts) { 666 if (this._shouldIgnoreEngine(engineName)) { 667 return; 668 } 669 this.currentEngine.recordUploaded(counts); 670 } 671 672 _shouldIgnoreEngine(engineName, shouldBeCurrent = true) { 673 if (!this.allowedEngines.has(engineName)) { 674 log.info( 675 `Notification for engine ${engineName}, but we aren't recording telemetry for it` 676 ); 677 return true; 678 } 679 if (shouldBeCurrent) { 680 if (!this.currentEngine || engineName != this.currentEngine.name) { 681 log.info(`Notification for engine ${engineName} but it isn't current`); 682 return true; 683 } 684 } 685 return false; 686 } 687 } 688 689 // The entire "sync ping" - it includes all the syncs, events etc recorded in 690 // the ping. 691 class SyncTelemetryImpl { 692 constructor(allowedEngines) { 693 log.manageLevelFromPref("services.sync.log.logger.telemetry"); 694 // This is accessible so we can enable custom engines during tests. 695 this.allowedEngines = allowedEngines; 696 this.current = null; 697 this.setupObservers(); 698 699 this.payloads = []; 700 this.discarded = 0; 701 this.events = []; 702 this.histograms = {}; 703 this.migrations = []; 704 this.maxEventsCount = lazy.Svc.PrefBranch.getIntPref( 705 "telemetry.maxEventsCount", 706 1000 707 ); 708 this.maxPayloadCount = lazy.Svc.PrefBranch.getIntPref( 709 "telemetry.maxPayloadCount" 710 ); 711 this.submissionInterval = 712 lazy.Svc.PrefBranch.getIntPref("telemetry.submissionInterval") * 1000; 713 this.lastSubmissionTime = Services.telemetry.msSinceProcessStart(); 714 this.lastUID = EMPTY_UID; 715 this.lastSyncNodeType = null; 716 this.currentSyncNodeType = null; 717 // Note that the sessionStartDate is somewhat arbitrary - the telemetry 718 // modules themselves just use `new Date()`. This means that our startDate 719 // isn't going to be the same as the sessionStartDate in the main pings, 720 // but that's OK for now - if it's a problem we'd need to change the 721 // telemetry modules to expose what it thinks the sessionStartDate is. 722 let sessionStartDate = new Date(); 723 this.sessionStartDate = lazy.TelemetryUtils.toLocalTimeISOString( 724 lazy.TelemetryUtils.truncateToHours(sessionStartDate) 725 ); 726 lazy.TelemetryController.registerSyncPingShutdown(() => this.shutdown()); 727 } 728 729 sanitizeFxaDeviceId(deviceId) { 730 return lazy.fxAccounts.telemetry.sanitizeDeviceId(deviceId); 731 } 732 733 prepareFxaDevices(devices) { 734 // For non-sync users, the data per device is limited -- just an id and a 735 // type (and not even the id yet). For sync users, if we can correctly map 736 // the fxaDevice to a sync device, then we can get os and version info, 737 // which would be quite unfortunate to lose. 738 let extraInfoMap = new Map(); 739 if (this.syncIsEnabled()) { 740 for (let client of this.getClientsEngineRecords()) { 741 if (client.fxaDeviceId) { 742 extraInfoMap.set(client.fxaDeviceId, { 743 os: client.os, 744 version: client.version, 745 syncID: this.sanitizeFxaDeviceId(client.id), 746 }); 747 } 748 } 749 } 750 // Finally, sanitize and convert to the proper format. 751 return devices.map(d => { 752 let { os, version, syncID } = extraInfoMap.get(d.id) || { 753 os: undefined, 754 version: undefined, 755 syncID: undefined, 756 }; 757 return { 758 id: this.sanitizeFxaDeviceId(d.id) || EMPTY_UID, 759 type: d.type, 760 os, 761 version, 762 syncID, 763 }; 764 }); 765 } 766 767 syncIsEnabled() { 768 return lazy.WeaveService.enabled && lazy.WeaveService.ready; 769 } 770 771 // Separate for testing. 772 getClientsEngineRecords() { 773 if (!this.syncIsEnabled()) { 774 throw new Error("Bug: syncIsEnabled() must be true, check it first"); 775 } 776 return lazy.Weave.Service.clientsEngine.remoteClients; 777 } 778 779 updateFxaDevices(devices) { 780 if (!devices) { 781 return {}; 782 } 783 let me = devices.find(d => d.isCurrentDevice); 784 let id = me ? this.sanitizeFxaDeviceId(me.id) : undefined; 785 let cleanDevices = this.prepareFxaDevices(devices); 786 return { deviceID: id, devices: cleanDevices }; 787 } 788 789 getFxaDevices() { 790 return lazy.fxAccounts.device.recentDeviceList; 791 } 792 793 getPingJSON(reason) { 794 let { devices, deviceID } = this.updateFxaDevices(this.getFxaDevices()); 795 return { 796 os: lazy.TelemetryEnvironment.currentEnvironment.system.os, 797 why: reason, 798 devices, 799 discarded: this.discarded || undefined, 800 version: PING_FORMAT_VERSION, 801 syncs: this.payloads.slice(), 802 uid: this.lastUID, 803 syncNodeType: this.lastSyncNodeType || undefined, 804 deviceID, 805 sessionStartDate: this.sessionStartDate, 806 events: !this.events.length ? undefined : this.events, 807 migrations: !this.migrations.length ? undefined : this.migrations, 808 histograms: !Object.keys(this.histograms).length 809 ? undefined 810 : this.histograms, 811 }; 812 } 813 814 _addMigrationRecord(type, info) { 815 log.debug("Saw telemetry migration info", type, info); 816 // Updates to this need to be documented in `sync-ping.rst` 817 // and considered in the metric definition of `syncs.migrations`. 818 switch (type) { 819 case "webext-storage": 820 this.migrations.push({ 821 type: "webext-storage", 822 entries: +info.entries, 823 entriesSuccessful: +info.entries_successful, 824 extensions: +info.extensions, 825 extensionsSuccessful: +info.extensions_successful, 826 openFailure: !!info.open_failure, 827 }); 828 break; 829 default: 830 throw new Error("Bug: Unknown migration record type " + type); 831 } 832 } 833 834 finish(reason) { 835 // Note that we might be in the middle of a sync right now, and so we don't 836 // want to touch this.current. 837 let result = this.getPingJSON(reason); 838 this.payloads = []; 839 this.discarded = 0; 840 this.events = []; 841 this.migrations = []; 842 this.histograms = {}; 843 this.submit(result); 844 } 845 846 setupObservers() { 847 for (let topic of TOPICS) { 848 lazy.Observers.add(topic, this, this); 849 } 850 } 851 852 shutdown() { 853 this.finish("shutdown"); 854 for (let topic of TOPICS) { 855 lazy.Observers.remove(topic, this, this); 856 } 857 } 858 859 submit(record) { 860 if (!this.isProductionSyncUser()) { 861 return false; 862 } 863 // We still call submit() with possibly illegal payloads so that tests can 864 // know that the ping was built. We don't end up submitting them, however. 865 let numEvents = record.events ? record.events.length : 0; 866 let numMigrations = record.migrations ? record.migrations.length : 0; 867 if (record.syncs.length || numEvents || numMigrations) { 868 log.trace( 869 `submitting ${record.syncs.length} sync record(s) and ` + 870 `${numEvents} event(s) to telemetry` 871 ); 872 lazy.TelemetryController.submitExternalPing("sync", record, { 873 usePingSender: true, 874 }).catch(err => { 875 log.error("failed to submit ping", err); 876 }); 877 878 Glean.syncs.discarded.set(record.discarded); 879 Glean.syncs.hashedFxaUid.set(record.uid); 880 Glean.syncs.hashedDeviceId.set(record.deviceID); 881 Glean.syncs.sessionStartDate.set(record.sessionStartDate * 1000); 882 Glean.syncs.syncNodeType.set(record.syncNodeType); 883 // Glean object metrics forbid the use of e.g. 'type' (bug 1945220), 884 // so we can't use record.syncs directly. 885 let syncs = structuredClone(record.syncs); 886 for (let sync of syncs) { 887 if (!("devices" in sync)) { 888 continue; 889 } 890 for (let device of sync.devices) { 891 device.device_type = device.type; 892 delete device.type; 893 } 894 } 895 Glean.syncs.syncs.set(syncs); 896 if (numMigrations) { 897 // Glean object metrics forbid the use of e.g. 'type' (bug 1945220), 898 // so we can't use record.migrations directly. 899 let migrations = structuredClone(record.migrations); 900 for (let migration of migrations) { 901 migration.migration_type = migration.type; 902 delete migration.type; 903 } 904 Glean.syncs.migrations.set(migrations); 905 } 906 GleanPings.sync.submit(record.why); 907 return true; 908 } 909 return false; 910 } 911 912 isProductionSyncUser() { 913 // If FxA isn't production then we treat sync as not being production. 914 // Further, there's the deprecated "services.sync.tokenServerURI" pref we 915 // need to consider - fxa doesn't consider that as if that's the only 916 // pref set, they *are* running a production fxa, just not production sync. 917 if ( 918 !lazy.FxAccounts.config.isProductionConfig() || 919 Services.prefs.prefHasUserValue("services.sync.tokenServerURI") 920 ) { 921 log.trace(`Not sending telemetry ping for self-hosted Sync user`); 922 return false; 923 } 924 return true; 925 } 926 927 onSyncStarted(data) { 928 const why = data && JSON.parse(data).why; 929 if (this.current) { 930 log.warn( 931 "Observed weave:service:sync:start, but we're already recording a sync!" 932 ); 933 // Just discard the old record, consistent with our handling of engines, above. 934 this.current = null; 935 } 936 this.current = new SyncRecord(this.allowedEngines, why); 937 } 938 939 // We need to ensure that the telemetry `deletion-request` ping always contains the user's 940 // current sync device ID, because if the user opts out of telemetry then the deletion ping 941 // will be immediately triggered for sending, and we won't have a chance to fill it in later. 942 // This keeps the `deletion-ping` up-to-date when the user's account state changes. 943 onAccountInitOrChange() { 944 // We don't submit sync pings for self-hosters, so don't need to collect their device ids either. 945 if (!this.isProductionSyncUser()) { 946 return; 947 } 948 // Awkwardly async, but no need to await. If the user's account state changes while 949 // this promise is in flight, it will reject and we won't record any data in the ping. 950 // (And a new notification will trigger us to try again with the new state). 951 lazy.fxAccounts.device 952 .getLocalId() 953 .then(deviceId => { 954 let sanitizedDeviceId = 955 lazy.fxAccounts.telemetry.sanitizeDeviceId(deviceId); 956 // In the past we did not persist the FxA metrics identifiers to disk, 957 // so this might be missing until we can fetch it from the server for the 958 // first time. There will be a fresh notification tirggered when it's available. 959 if (sanitizedDeviceId) { 960 // Sanitized device ids are 64 characters long, but telemetry limits scalar strings to 50. 961 // The first 32 chars are sufficient to uniquely identify the device, so just send those. 962 // It's hard to change the sync ping itself to only send 32 chars, to b/w compat reasons. 963 sanitizedDeviceId = sanitizedDeviceId.substr(0, 32); 964 Glean.deletionRequest.syncDeviceId.set(sanitizedDeviceId); 965 } 966 }) 967 .catch(err => { 968 log.warn( 969 `Failed to set sync identifiers in the deletion-request ping: ${err}` 970 ); 971 }); 972 } 973 974 // This keeps the `deletion-request` ping up-to-date when the user signs out, 975 // clearing the now-nonexistent sync device id. 976 onAccountLogout() { 977 Glean.deletionRequest.syncDeviceId.set(""); 978 } 979 980 _checkCurrent(topic) { 981 if (!this.current) { 982 // This is only `info` because it happens when we do a tabs "quick-write" 983 log.info( 984 `Observed notification ${topic} but no current sync is being recorded.` 985 ); 986 return false; 987 } 988 return true; 989 } 990 991 _shouldSubmitForDataChange() { 992 let newID = lazy.fxAccounts.telemetry.getSanitizedUID() || EMPTY_UID; 993 let oldID = this.lastUID; 994 if ( 995 newID != EMPTY_UID && 996 oldID != EMPTY_UID && 997 // Both are "real" uids, so we care if they've changed. 998 newID != oldID 999 ) { 1000 log.trace( 1001 `shouldSubmitForDataChange - uid from '${oldID}' -> '${newID}'` 1002 ); 1003 return true; 1004 } 1005 // We've gone from knowing one of the ids to not knowing it (which we 1006 // ignore) or we've gone from not knowing it to knowing it (which is fine), 1007 // Now check the node type because a change there also means we should 1008 // submit. 1009 if ( 1010 this.lastSyncNodeType && 1011 this.currentSyncNodeType != this.lastSyncNodeType 1012 ) { 1013 log.trace( 1014 `shouldSubmitForDataChange - nodeType from '${this.lastSyncNodeType}' -> '${this.currentSyncNodeType}'` 1015 ); 1016 return true; 1017 } 1018 log.trace("shouldSubmitForDataChange - no need to submit"); 1019 return false; 1020 } 1021 1022 maybeSubmitForDataChange() { 1023 if (this._shouldSubmitForDataChange()) { 1024 log.info( 1025 "Early submission of sync telemetry due to changed IDs/NodeType" 1026 ); 1027 this.finish("idchange"); // this actually submits. 1028 this.lastSubmissionTime = Services.telemetry.msSinceProcessStart(); 1029 } 1030 1031 // Only update the last UIDs if we actually know them. 1032 let current_uid = lazy.fxAccounts.telemetry.getSanitizedUID(); 1033 if (current_uid) { 1034 this.lastUID = current_uid; 1035 } 1036 if (this.currentSyncNodeType) { 1037 this.lastSyncNodeType = this.currentSyncNodeType; 1038 } 1039 } 1040 1041 maybeSubmitForInterval() { 1042 // We want to submit the ping every `this.submissionInterval` but only when 1043 // there's no current sync in progress, otherwise we may end up submitting 1044 // the sync and the events caused by it in different pings. 1045 if ( 1046 this.current == null && 1047 Services.telemetry.msSinceProcessStart() - this.lastSubmissionTime > 1048 this.submissionInterval 1049 ) { 1050 this.finish("schedule"); 1051 this.lastSubmissionTime = Services.telemetry.msSinceProcessStart(); 1052 } 1053 } 1054 1055 onSyncFinished(error) { 1056 if (!this.current) { 1057 log.warn("onSyncFinished but we aren't recording"); 1058 return; 1059 } 1060 this.current.finished(error); 1061 this.currentSyncNodeType = this.current.syncNodeType; 1062 let current = this.current; 1063 this.current = null; 1064 this.takeTelemetryRecord(current); 1065 } 1066 1067 takeTelemetryRecord(record) { 1068 // We check for "data change" before appending the current sync to payloads, 1069 // as it is the current sync which has the data with the new data, and thus 1070 // must go in the *next* submission. 1071 this.maybeSubmitForDataChange(); 1072 if (this.payloads.length < this.maxPayloadCount) { 1073 this.payloads.push(record.toJSON()); 1074 } else { 1075 ++this.discarded; 1076 } 1077 // If we are submitting due to timing, it's desirable that the most recent 1078 // sync is included, so we check after appending the record. 1079 this.maybeSubmitForInterval(); 1080 } 1081 1082 _addHistogram(hist) { 1083 let histogram = Services.telemetry.getHistogramById(hist); 1084 let s = histogram.snapshot(); 1085 this.histograms[hist] = s; 1086 } 1087 1088 _recordEvent(eventDetails) { 1089 this.maybeSubmitForDataChange(); 1090 1091 if (this.events.length >= this.maxEventsCount) { 1092 log.warn("discarding event - already queued our maximum", eventDetails); 1093 return; 1094 } 1095 1096 let { object, method, value, extra } = eventDetails; 1097 if (extra) { 1098 extra = normalizeExtraTelemetryFields(extra); 1099 eventDetails = { object, method, value, extra }; 1100 } 1101 1102 if (!validateTelemetryEvent(eventDetails)) { 1103 // we've already logged what the problem is... 1104 return; 1105 } 1106 log.debug("recording event", eventDetails); 1107 1108 if (extra && lazy.Resource.serverTime && !extra.serverTime) { 1109 extra.serverTime = String(lazy.Resource.serverTime); 1110 } 1111 let category = "sync"; 1112 let ts = Math.floor(tryGetMonotonicTimestamp()); 1113 1114 // An event record is a simple array with at least 4 items. 1115 let event = [ts, category, method, object]; 1116 // It may have up to 6 elements if |extra| is defined 1117 if (value) { 1118 event.push(value); 1119 if (extra) { 1120 event.push(extra); 1121 } 1122 } else if (extra) { 1123 event.push(null); // a null for the empty value. 1124 event.push(extra); 1125 } 1126 this.events.push(event); 1127 this.maybeSubmitForInterval(); 1128 } 1129 1130 observe(subject, topic, data) { 1131 log.trace(`observed ${topic} ${data}`); 1132 1133 switch (topic) { 1134 case "weave:service:ready": 1135 case "weave:service:login:got-hashed-id": 1136 case "fxaccounts:new_device_id": 1137 this.onAccountInitOrChange(); 1138 break; 1139 1140 case "fxaccounts:onlogout": 1141 this.onAccountLogout(); 1142 break; 1143 1144 /* sync itself state changes */ 1145 case "weave:service:sync:start": 1146 this.onSyncStarted(data); 1147 break; 1148 1149 case "weave:service:sync:finish": 1150 if (this._checkCurrent(topic)) { 1151 this.onSyncFinished(null); 1152 } 1153 break; 1154 1155 case "weave:service:sync:error": 1156 // argument needs to be truthy (this should always be the case) 1157 this.onSyncFinished(subject || "Unknown"); 1158 break; 1159 1160 /* engine sync state changes */ 1161 case "weave:engine:sync:start": 1162 if (this._checkCurrent(topic)) { 1163 this.current.onEngineStart(data); 1164 } 1165 break; 1166 case "weave:engine:sync:finish": 1167 if (this._checkCurrent(topic)) { 1168 this.current.onEngineStop(data, null); 1169 } 1170 break; 1171 1172 case "weave:engine:sync:error": 1173 if (this._checkCurrent(topic)) { 1174 // argument needs to be truthy (this should always be the case) 1175 this.current.onEngineStop(data, subject || "Unknown"); 1176 } 1177 break; 1178 1179 /* engine counts */ 1180 case "weave:engine:sync:applied": 1181 if (this._checkCurrent(topic)) { 1182 this.current.onEngineApplied(data, subject); 1183 } 1184 break; 1185 1186 case "weave:engine:sync:step": 1187 if (this._checkCurrent(topic)) { 1188 this.current.onEngineStep(data, subject); 1189 } 1190 break; 1191 1192 case "weave:engine:sync:uploaded": 1193 if (this._checkCurrent(topic)) { 1194 this.current.onEngineUploaded(data, subject); 1195 } 1196 break; 1197 1198 case "weave:engine:validate:finish": 1199 if (this._checkCurrent(topic)) { 1200 this.current.onEngineValidated(data, subject); 1201 } 1202 break; 1203 1204 case "weave:engine:validate:error": 1205 if (this._checkCurrent(topic)) { 1206 this.current.onEngineValidateError(data, subject || "Unknown"); 1207 } 1208 break; 1209 1210 case "weave:telemetry:event": 1211 case "fxa:telemetry:event": 1212 this._recordEvent(subject); 1213 break; 1214 1215 case "weave:telemetry:histogram": 1216 this._addHistogram(data); 1217 break; 1218 1219 case "weave:telemetry:migration": 1220 this._addMigrationRecord(data, subject); 1221 break; 1222 1223 default: 1224 log.warn(`unexpected observer topic ${topic}`); 1225 break; 1226 } 1227 } 1228 1229 // Transform an exception into a standard description. Exposed here for when 1230 // this module isn't directly responsible for knowing the transform should 1231 // happen (for example, when including an error in the |extra| field of 1232 // event telemetry) 1233 transformError(error) { 1234 // Certain parts of sync will use this pattern as a way to communicate to 1235 // processIncoming to abort the processing. However, there's no guarantee 1236 // this can only happen then. 1237 if (typeof error == "object" && error.code && error.cause) { 1238 error = error.cause; 1239 } 1240 if (lazy.Async.isShutdownException(error)) { 1241 return { name: "shutdownerror" }; 1242 } 1243 1244 if (typeof error === "string") { 1245 if (error.startsWith("error.")) { 1246 // This is hacky, but I can't imagine that it's not also accurate. 1247 return { name: "othererror", error }; 1248 } 1249 error = ErrorSanitizer.cleanErrorMessage(error); 1250 return { name: "unexpectederror", error }; 1251 } 1252 1253 if (error instanceof lazy.AuthenticationError) { 1254 return { name: "autherror", from: error.source }; 1255 } 1256 1257 if (DOMException.isInstance(error)) { 1258 return { 1259 name: "unexpectederror", 1260 error: ErrorSanitizer.cleanErrorMessage(error.message, error), 1261 }; 1262 } 1263 1264 let httpCode = 1265 error.status || (error.response && error.response.status) || error.code; 1266 1267 if (httpCode) { 1268 return { name: "httperror", code: httpCode }; 1269 } 1270 1271 if (error.failureCode) { 1272 return { name: "othererror", error: error.failureCode }; 1273 } 1274 1275 if (error.result) { 1276 // many "nsresult" errors are actually network errors - if they are 1277 // associated with the "network" module we assume that's true. 1278 // We also assume NS_ERROR_ABORT is such an error - for almost everything 1279 // we care about, it acually is (eg, if the connection fails early enough 1280 // or if we have a captive portal etc) - we don't lose anything by this 1281 // assumption, it's just that the error will no longer be in the "nserror" 1282 // category, so our analysis can still find them. 1283 if ( 1284 error.result == Cr.NS_ERROR_ABORT || 1285 NS_ERROR_GET_MODULE(error.result) == NS_ERROR_MODULE_NETWORK 1286 ) { 1287 return { name: "httperror", code: error.result }; 1288 } 1289 return { name: "nserror", code: error.result }; 1290 } 1291 // It's probably an Error object, but it also could be some 1292 // other object that may or may not override toString to do 1293 // something useful. 1294 let msg = String(error); 1295 if (msg.startsWith("[object")) { 1296 // Nothing useful in the default, check for a string "message" property. 1297 if (typeof error.message == "string") { 1298 msg = String(error.message); 1299 } else { 1300 // Hopefully it won't come to this... 1301 msg = JSON.stringify(error); 1302 } 1303 } 1304 return { 1305 name: "unexpectederror", 1306 error: ErrorSanitizer.cleanErrorMessage(msg), 1307 }; 1308 } 1309 } 1310 1311 export var SyncTelemetry = new SyncTelemetryImpl(ENGINES);