commit bf945545b1b289ef5ef25ce7a81b1e33e0af200f parent ea950bff4a837e43abe7d05fbaae5da0a4f9ec45 Author: Alex Franchuk <afranchuk@mozilla.com> Date: Wed, 8 Oct 2025 20:16:08 +0000 Bug 1993330 p2 - Revert Bug 1962160 to restore the Legacy Telemetry crash ping r=gstoll This reverts commit b7987d4265b06d4cc552dbd3ac98ad939325abe7. Differential Revision: https://phabricator.services.mozilla.com/D268024 Diffstat:
17 files changed, 913 insertions(+), 321 deletions(-)
diff --git a/toolkit/components/crashes/CrashManager.in.sys.mjs b/toolkit/components/crashes/CrashManager.in.sys.mjs @@ -9,6 +9,7 @@ const lazy = {}; ChromeUtils.defineESModuleGetters(lazy, { Log: "resource://gre/modules/Log.sys.mjs", + TelemetryController: "resource://gre/modules/TelemetryController.sys.mjs", }); /** @@ -48,6 +49,31 @@ function getAndRemoveField(obj, field) { } /** + * Parse the string stored in the specified field as JSON and then remove the + * field from the object. + * + * @param obj {Object} The object holding the field + * @param field {String} The name of the field to be parsed and removed + * + * @returns {Object} the parsed object, null if none was found + */ +function parseAndRemoveField(obj, field) { + let value = null; + + if (field in obj) { + try { + value = JSON.parse(obj[field]); + } catch (e) { + console.error(e); + } + + delete obj[field]; + } + + return value; +} + +/** * Convert a legacy Telemetry `StackTraces` layout to that expected by Glean. * * @param stackTraces {Object} The legacy Telemetry StackTraces object. @@ -177,6 +203,9 @@ export var CrashManager = function (options) { // Map of crash ID / promise tuples used to track adding new crashes. this._crashPromises = new Map(); + // Promise for the crash ping used only for testing. + this._pingPromise = null; + // The CrashStore currently attached to this object. this._store = null; @@ -929,16 +958,20 @@ CrashManager.prototype = Object.freeze({ // If we have a saved environment, use it. Otherwise report // the current environment. 
let reportMeta = Cu.cloneInto(metadata, {}); - - // Delete unused fields from legacy telemetry - delete reportMeta.TelemetryEnvironment; - delete reportMeta.TelemetrySessionId; - + let crashEnvironment = parseAndRemoveField( + reportMeta, + "TelemetryEnvironment" + ); + let sessionId = getAndRemoveField(reportMeta, "TelemetrySessionId"); let stackTraces = getAndRemoveField(reportMeta, "StackTraces"); let minidumpSha256Hash = getAndRemoveField( reportMeta, "MinidumpSha256Hash" ); + // If CrashPingUUID is present then a Telemetry ping was generated by the + // crashreporter for this crash so we only need to send the Glean ping. + let onlyGlean = getAndRemoveField(reportMeta, "CrashPingUUID"); + // Filter the remaining annotations to remove privacy-sensitive ones reportMeta = this._filterAnnotations(reportMeta); @@ -955,6 +988,31 @@ CrashManager.prototype = Object.freeze({ reportMeta ); } + + if (onlyGlean) { + return; + } + + this._pingPromise = lazy.TelemetryController.submitExternalPing( + "crash", + { + version: 1, + crashDate: date.toISOString().slice(0, 10), // YYYY-MM-DD + crashTime: date.toISOString().slice(0, 13) + ":00:00.000Z", // per-hour resolution + sessionId, + crashId, + minidumpSha256Hash, + processType: type, + stackTraces, + metadata: reportMeta, + hasCrashEnvironment: crashEnvironment !== null, + }, + { + addClientId: true, + addEnvironment: true, + overrideEnvironment: crashEnvironment, + } + ); }, _handleEventFilePayload(store, entry, type, date, payload) { diff --git a/toolkit/components/crashes/docs/index.rst b/toolkit/components/crashes/docs/index.rst @@ -23,10 +23,16 @@ provided externally. Crash Pings =========== -The Crash Manager is responsible for sending crash pings when a crash occurs or when a crash event -is found. Crash pings are sent using `Glean pings <../../glean/index.html>`__. The Glean ``crash`` -ping can be found `here -<https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/pings/crash>`__. 
+The Crash Manager is responsible for sending crash pings when a crash occurs +or when a crash event is found. Crash pings are sent using +`Telemetry pings <../../telemetry/data/crash-ping.html>`__. + +Glean +----- +Crash pings have been migrated to use `Glean pings <../../glean/index.html>`__; however, +the Telemetry pings will still be sent until we are confident that the Glean +implementation is robust. The Glean ``crash`` ping can be found +`here <https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/pings/crash>`__. See `bug 1784069 <https://bugzilla.mozilla.org/show_bug.cgi?id=1784069>`_ for details. diff --git a/toolkit/components/crashes/tests/xpcshell/test_crash_manager.js b/toolkit/components/crashes/tests/xpcshell/test_crash_manager.js @@ -296,14 +296,6 @@ add_task(async function test_main_crash_event_file() { crashId, metadata ); - - let pingSubmitted = false; - GleanPings.crash.testBeforeNextSubmit(_ => { - const MINUTES = new Date(DUMMY_DATE); - Assert.equal(Glean.crash.time.testGetValue().getTime(), MINUTES.getTime()); - pingSubmitted = true; - }); - let count = await m.aggregateEventsFiles(); Assert.equal(count, 1); @@ -319,7 +311,26 @@ add_task(async function test_main_crash_event_file() { Assert.ok(crashes[0].metadata.StackTraces); Assert.deepEqual(crashes[0].crashDate, DUMMY_DATE); - Assert.ok(pingSubmitted, "ping submitted for found crash"); + let found = await ac.promiseFindPing("crash", [ + [["payload", "hasCrashEnvironment"], true], + [["payload", "metadata", "ProductName"], productName], + [["payload", "metadata", "ProductID"], productId], + [["payload", "minidumpSha256Hash"], sha256Hash], + [["payload", "crashId"], crashId], + [["payload", "stackTraces", "status"], "OK"], + [["payload", "sessionId"], sessionId], + ]); + Assert.ok(found, "Telemetry ping submitted for found crash"); + Assert.deepEqual( + found.environment, + theEnvironment, + "The saved environment should be present" + ); + Assert.equal( + found.payload.metadata.TestKey, 
+ undefined, + "Non-allowed fields should be filtered out" + ); count = await m.aggregateEventsFiles(); Assert.equal(count, 0); @@ -341,14 +352,6 @@ add_task(async function test_main_crash_event_file_noenv() { crashId, metadata ); - - let pingSubmitted = false; - GleanPings.crash.testBeforeNextSubmit(_ => { - const MINUTES = new Date(DUMMY_DATE); - Assert.equal(Glean.crash.time.testGetValue().getTime(), MINUTES.getTime()); - pingSubmitted = true; - }); - let count = await m.aggregateEventsFiles(); Assert.equal(count, 1); @@ -362,7 +365,13 @@ add_task(async function test_main_crash_event_file_noenv() { }); Assert.deepEqual(crashes[0].crashDate, DUMMY_DATE); - Assert.ok(pingSubmitted, "ping submitted for found crash"); + let found = await ac.promiseFindPing("crash", [ + [["payload", "hasCrashEnvironment"], false], + [["payload", "metadata", "ProductName"], productName], + [["payload", "metadata", "ProductID"], productId], + ]); + Assert.ok(found, "Telemetry ping submitted for found crash"); + Assert.ok(found.environment, "There is an environment"); count = await m.aggregateEventsFiles(); Assert.equal(count, 0); @@ -687,6 +696,101 @@ add_task(async function test_addCrash() { ); }); +add_task(async function test_child_process_crash_ping() { + let m = await getManager(); + const EXPECTED_PROCESSES = [ + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_DEFAULT], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_CONTENT], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_GMPLUGIN], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_GPU], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_VR], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_RDD], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_SOCKET], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_FORKSERVER], + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_UTILITY], + ]; + + const UNEXPECTED_PROCESSES = [ + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_IPDLUNITTEST], + null, + 12, // non-string process type + ]; + + let ac = new 
TelemetryArchiveTesting.Checker(); + await ac.promiseInit(); + + // Add a child-process crash for each allowed process type. + for (let p of EXPECTED_PROCESSES) { + // Generate a ping. + const remoteType = + p === m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_CONTENT] + ? "web" + : undefined; + let id = await m.createDummyDump(); + await m.addCrash(p, m.CRASH_TYPE_CRASH, id, DUMMY_DATE, { + RemoteType: remoteType, + StackTraces: stackTraces, + MinidumpSha256Hash: sha256Hash, + ipc_channel_error: "ShutDownKill", + TestKey: "this-should-not-end-up-in-the-ping", + }); + await m._pingPromise; + + let found = await ac.promiseFindPing("crash", [ + [["payload", "crashId"], id], + [["payload", "minidumpSha256Hash"], sha256Hash], + [["payload", "processType"], p], + [["payload", "stackTraces", "status"], "OK"], + ]); + Assert.ok(found, "Telemetry ping submitted for " + p + " crash"); + + let hoursOnly = new Date(DUMMY_DATE); + hoursOnly.setSeconds(0); + hoursOnly.setMinutes(0); + Assert.equal( + new Date(found.payload.crashTime).getTime(), + hoursOnly.getTime() + ); + + Assert.equal( + found.payload.metadata.TestKey, + undefined, + "Non-allowed fields should be filtered out" + ); + Assert.equal( + found.payload.metadata.RemoteType, + remoteType, + "RemoteType should be allowed for content crashes" + ); + Assert.equal( + found.payload.metadata.ipc_channel_error, + "ShutDownKill", + "ipc_channel_error should be allowed for content crashes" + ); + } + + // Check that we don't generate a crash ping for invalid/unexpected process + // types. + for (let p of UNEXPECTED_PROCESSES) { + let id = await m.createDummyDump(); + await m.addCrash(p, m.CRASH_TYPE_CRASH, id, DUMMY_DATE, { + StackTraces: stackTraces, + MinidumpSha256Hash: sha256Hash, + TestKey: "this-should-not-end-up-in-the-ping", + }); + await m._pingPromise; + + // Check that we didn't receive any new ping. 
+ let found = await ac.promiseFindPing("crash", [ + [["payload", "crashId"], id], + ]); + Assert.ok( + !found, + "No telemetry ping must be submitted for invalid process types" + ); + } +}); + add_task(async function test_glean_crash_ping() { let m = await getManager(); @@ -1160,3 +1264,37 @@ add_task(async function test_telemetryHistogram() { "Some crash types do not match" ); }); + +// Test that a ping with `CrashPingUUID` in the metadata (as set by the +// external crash reporter) is sent with Glean but not with Telemetry (because +// the crash reporter already sends it using Telemetry). +add_task(async function test_crash_reporter_ping_with_uuid() { + let m = await getManager(); + + let id = await m.createDummyDump(); + + // Realistically this case will only happen through + // `_handleEventFilePayload`, however the `_sendCrashPing` method will check + // for it regardless of where it is called. + let metadata = { CrashPingUUID: "bff6bde4-f96c-4859-8c56-6b3f40878c26" }; + + // Glean hooks + let glean_submitted = false; + GleanPings.crash.testBeforeNextSubmit(_ => { + glean_submitted = true; + }); + + await m.addCrash( + m.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_CONTENT], + m.CRASH_TYPE_CRASH, + id, + DUMMY_DATE, + metadata + ); + + // Ping promise is only set if the Telemetry ping is submitted. + let telemetry_submitted = !!m._pingPromise; + + Assert.ok(glean_submitted); + Assert.ok(!telemetry_submitted); +}); diff --git a/toolkit/components/telemetry/docs/concepts/crashes.rst b/toolkit/components/telemetry/docs/concepts/crashes.rst @@ -8,7 +8,7 @@ Main process crashes ==================== If the Firefox main process dies, that should be recorded as an aborted session. We would submit a :doc:`main ping <../data/main-ping>` with the reason ``aborted-session``. -If we have a crash dump for that crash, we should also submit a :doc:`crash ping <../../crashes/crash-manager/crash-ping-lifecycle>`. 
+If we have a crash dump for that crash, we should also submit a :doc:`crash ping <../data/crash-ping>`. The ``aborted-session`` information is first written to disk 60 seconds after startup, any earlier crashes will not trigger an ``aborted-session`` ping. Also, the ``aborted-session`` is updated at least every 5 minutes, so it may lag behind the last session state. @@ -20,4 +20,4 @@ If a Firefox plugin, content, gmplugin, or any other type of child process dies If we catch a crash report for this, then additionally the ``SUBPROCESS_CRASHES_WITH_DUMP`` keyed histogram (glean subprocess:crashes_with_dump metric) is incremented. -Some processes also generate :doc:`crash pings <../../crashes/crash-manager/crash-ping-lifecycle>` when they crash and generate a crash dump. See `bug 1352496 <https://bugzilla.mozilla.org/show_bug.cgi?id=1352496>`_ for an example of how to allow crash pings for new process types. +Some processes also generate :doc:`crash pings <../data/crash-ping>` when they crash and generate a crash dump. See `bug 1352496 <https://bugzilla.mozilla.org/show_bug.cgi?id=1352496>`_ for an example of how to allow crash pings for new process types. diff --git a/toolkit/components/telemetry/docs/concepts/pings.rst b/toolkit/components/telemetry/docs/concepts/pings.rst @@ -23,6 +23,7 @@ Important examples are: * :doc:`main <../data/main-ping>` - contains the information collected by Telemetry (Histograms, Scalars, ...) * :doc:`saved-session <../data/main-ping>` - has the same format as a main ping, but it contains the *"classic"* Telemetry payload with measurements covering the whole browser session. This is only a separate type to make storage of saved-session easier server-side. As of Firefox 61 this is sent on Android only. +* :doc:`crash <../data/crash-ping>` - a ping that is captured and sent after a Firefox process crashes. * :doc:`new-profile <../data/new-profile-ping>` - sent on the first run of a new profile. 
* :doc:`update <../data/update-ping>` - sent right after an update is downloaded. * :doc:`deletion-request <../data/deletion-request-ping>` - sent when FHR upload is disabled diff --git a/toolkit/components/telemetry/docs/concepts/submission.rst b/toolkit/components/telemetry/docs/concepts/submission.rst @@ -28,7 +28,7 @@ Any new ping submissions and "idle-daily" events reset this behavior as a safety Pingsender ========== -Some pings (e.g. :doc:`main pings <../data/main-ping>` with reason `shutdown`) are submitted using the :doc:`../internals/pingsender`. +Some pings (e.g. :doc:`crash pings <../data/crash-ping>` and :doc:`main pings <../data/main-ping>` with reason `shutdown`) are submitted using the :doc:`../internals/pingsender`. The pingsender tries to send each ping once and, if it fails, no additional attempt is performed: ``TelemetrySend`` will take care of retrying using the previously described submission logic. diff --git a/toolkit/components/telemetry/docs/data/crash-ping.rst b/toolkit/components/telemetry/docs/data/crash-ping.rst @@ -0,0 +1,270 @@ +Crash ping +========== + +This ping is captured after the main Firefox process crashes or after a child +process crashes, whether or not the crash report is submitted to +crash-stats.mozilla.org. It includes non-identifying metadata about the crash. + +.. warning:: + The Telemetry crash ping will be deprecated and eventually removed. It is being + replaced by the Glean crash ping. See `bug 1784069 <https://bugzilla.mozilla.org/show_bug.cgi?id=1784069>`_. + Please be sure to mirror any changes made to the Telemetry ping! + +This ping is sent either by the ``CrashManager`` or by the crash reporter +client. The ``CrashManager`` is responsible for sending crash pings for +child process crashes, which are sent right after the crash is detected, +as well as for main process crashes, which are sent after Firefox restarts +successfully. 
The crash reporter client sends crash pings only for main process +crashes whether or not the user also reports the crash. The crash reporter +client will not send the crash ping if telemetry has been disabled in Firefox. + +The environment block that is sent with this ping varies: if Firefox was running +long enough to record the environment block before the crash, then the environment +at the time of the crash will be recorded and ``hasCrashEnvironment`` will be true. +If Firefox crashed before the environment was recorded, ``hasCrashEnvironment`` will +be false and the recorded environment will be the environment at time of submission. + +The client ID and profile group ID are submitted with this ping. + +The metadata field holds a subset of the crash annotations, all field values +are stored as strings but some may be interpreted either as numbers or +boolean values. Numbers are integral unless stated otherwise in the +description. Boolean values are set to "1" when true, "0" when false. If +they're absent then they should be interpreted as false. + +Structure: + +.. code-block:: js + + { + type: "crash", + ... common ping data + clientId: <UUID>, + profileGroupId: <UUID>, + environment: { ... }, + payload: { + crashDate: "YYYY-MM-DD", + crashTime: <ISO Date>, // per-hour resolution + version: 1, + sessionId: <UUID>, // Telemetry ID of crashing session. May be missing for crashes that happen early in startup + crashId: <UUID>, // Optional, ID of the associated crash + minidumpSha256Hash: <hash>, // SHA256 hash of the minidump file + processType: <type>, // Type of process that crashed, see below for a list of types + stackTraces: { ... }, // Optional, see below + metadata: { // Annotations saved while Firefox was running. 
See CrashAnnotations.yaml for more information + ProductID: "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}", + ProductName: "Firefox", + ReleaseChannel: <channel>, + Version: <version number>, + BuildID: "YYYYMMDDHHMMSS", + AsyncShutdownTimeout: <json>, // Optional, present when a shutdown blocker failed to respond within a reasonable amount of time + AvailablePageFile: <size>, // Windows-only, available paging file in bytes + AvailablePhysicalMemory: <size>, // Windows-only, available physical memory in bytes + AvailableSwapMemory: <size>, // macOS- and Linux-only, available swap space + AvailableVirtualMemory: <size>, // Windows-only, available virtual memory in bytes + BackgroundTaskName: "task_name", // Optional, if the app was invoked in background task mode via `--backgroundtask task_name` + BlockedDllList: <list>, // Windows-only, see WindowsDllBlocklist.cpp for details + BlocklistInitFailed: "1", // Windows-only, present only if the DLL blocklist initialization failed + CrashTime: <time>, // Seconds since the Epoch + DOMFissionEnabled: "1", // Optional, if set indicates that a Fission window had been opened + EventLoopNestingLevel: <levels>, // Optional, present only if >0, indicates the nesting level of the event-loop + FontName: <name>, // Optional, the font family name that is being loaded when the crash occurred + GPUProcessLaunchCount: <num>, // Number of times the GPU process was launched + HeadlessMode: "1", // Optional, "1" if the app was invoked in headless mode via `--headless ...` or `--backgroundtask ...` + ipc_channel_error: <error string>, // Optional, contains the string processing error reason for an ipc-based content crash + IsGarbageCollecting: "1", // Optional, if set indicates that the crash occurred while the garbage collector was running + LowCommitSpaceEvents: <num>, // Windows-only, present only if >0, number of low commit space events detected by the available memory tracker + MainThreadRunnableName: <name>, // Optional, Nightly-only, name 
of the currently executing nsIRunnable on the main thread + MozCrashReason: <reason>, // Optional, contains the string passed to MOZ_CRASH() + NimbusEnrollments: <enrollments>, // Optional, a comma-separated string that specifies the active Nimbus experiments and rollouts, as well as their branches. + OOMAllocationSize: <size>, // Size of the allocation that caused an OOM + ProfilerChildShutdownPhase: <string>, // Profiler shutdown phase + PurgeablePhysicalMemory: <size>, // macOS-only, amount of memory that can be deallocated by the OS in case of memory pressure + QuotaManagerShutdownTimeout: <log-string>, // Optional, contains a list of shutdown steps and status of the quota manager clients + RemoteType: <type>, // Optional, type of content process, see below for a list of types + SecondsSinceLastCrash: <duration>, // Seconds elapsed since the last crash occurred + ShutdownProgress: <phase>, // Optional, contains a string describing the shutdown phase in which the crash occurred + SystemMemoryUsePercentage: <percentage>, // Windows-only, percent of memory in use + StartupCrash: "1", // Optional, if set indicates that Firefox crashed during startup + TextureUsage: <usage>, // Optional, usage of texture memory in bytes + TotalPageFile: <size>, // Windows-only, paging file in use expressed in bytes + TotalPhysicalMemory: <size>, // Windows-only, physical memory in use expressed in bytes + TotalVirtualMemory: <size>, // Windows-only, virtual memory in use expressed in bytes + UptimeTS: <duration>, // Seconds since Firefox was started, this can have a fractional component + User32BeforeBlocklist: "1", // Windows-only, present only if user32.dll was loaded before the DLL blocklist has been initialized + WindowsErrorReporting: "1", // Windows-only, present only if the crash was intercepted by the WER runtime exception module + WindowsFileDialogErrorCode: <error code>, // Windows-only, optional, present only if file-dialog IPC failed + WindowsPackageFamilyName: <string>, 
// Windows-only, a string containing the "Package Family Name" of Firefox, if installed through an MSIX package + }, + hasCrashEnvironment: bool + } + } + +.. note:: + + For "crash" pings generated by the crashreporter we are deliberately truncating the ``creationTime`` + field to hours. See `bug 1345108 <https://bugzilla.mozilla.org/show_bug.cgi?id=1345108>`_ for context. + +Process Types +------------- + +The ``processType`` field contains the type of process that crashed. There are +currently multiple process types defined in ``nsICrashService`` but crash pings +are sent only for the ones below: + ++---------------+-------------------------------------------------------------------------------+ +| Type | Description | ++===============+===============================================================================+ +| main | :ref:`Main process <parent-process>`, also known as parent or browser process | ++---------------+-------------------------------------------------------------------------------+ +| content | :ref:`Content process <content-process>` | ++---------------+-------------------------------------------------------------------------------+ +| gmplugin | :ref:`Gecko media plugin <gecko-media-plugins-process>` | ++---------------+-------------------------------------------------------------------------------+ +| gpu | :ref:`GPU process <gpu-process>` | ++---------------+-------------------------------------------------------------------------------+ +| vr | :ref:`VR process <vr-process>` | ++---------------+-------------------------------------------------------------------------------+ +| rdd | :ref:`Data decoder process <data-decoder-process>` | ++---------------+-------------------------------------------------------------------------------+ +| socket | :ref:`Network socket process <network-socket-process>` | ++---------------+-------------------------------------------------------------------------------+ +| forkserver | :ref:`Fork server 
<fork-server>` | ++---------------+-------------------------------------------------------------------------------+ +| utility | :ref:`Utility process <utility-process>` | ++---------------+-------------------------------------------------------------------------------+ + +.. _remote-process-types: + +Remote Process Types +-------------------- + +The optional ``remoteType`` field contains the type of the content process that +crashed. As such it is present only if ``processType`` contains the ``content`` +value. The following content process types are currently defined: + ++-----------+--------------------------------------------------------+ +| Type | Description | ++===========+========================================================+ +| web | The content process was running code from a web page | ++-----------+--------------------------------------------------------+ +| file | The content process was running code from a local file | ++-----------+--------------------------------------------------------+ +| extension | The content process was running code from an extension | ++-----------+--------------------------------------------------------+ + +Stack Traces +------------ + +The crash ping may contain a ``stackTraces`` field which has been populated +with stack traces for all threads in the crashed process. The format of this +field is similar to the one used by Socorro for representing a crash. The main +differences are that redundant fields are not stored and that the module a +frame belongs to is referenced by index in the module array rather than by its +file name. + +Note that this field does not contain data from the application; only bare +stack traces and module lists are stored. + +.. code-block:: js + + { + status: <string>, // Status of the analysis, "OK" or an error message + crash_info: { // Basic crash information + type: <string>, // Type of crash, SIGSEGV, assertion, etc... 
+ address: <addr>, // Crash address, hex format, see the notes below + crashing_thread: <index> // Index in the thread array below + }, + main_module: <index>, // Index of Firefox's executable in the module list + modules: [{ + base_addr: <addr>, // Base address of the module, hex format + end_addr: <addr>, // End address of the module, hex format + code_id: <string>, // Unique ID of this module, see the notes below + debug_file: <string>, // Name of the file holding the debug information + debug_id: <string>, // ID or hash of the debug information file + filename: <string>, // File name + version: <string>, // Library/executable version + }, + ... // List of modules ordered by base memory address + ], + threads: [{ // Stack traces for every thread + frames: [{ + module_index: <index>, // Index of the module this frame belongs to + ip: <ip>, // Program counter, hex format + trust: <string> // Trust of this frame, see the notes below + }, + ... // List of frames, the first frame is the topmost + ] + }] + } + +Notes +~~~~~ + +Memory addresses and instruction pointers are always stored as strings in +hexadecimal format (e.g. "0x4000"). They can be made of up to 16 characters for +64-bit addresses. + +The crash type is both OS and CPU dependent and can be either a descriptive +string (e.g. SIGSEGV, EXCEPTION_ACCESS_VIOLATION) or a raw numeric value. The +crash address meaning depends on the type of crash. In a segmentation fault the +crash address will be the memory address whose access caused the fault; in a +crash triggered by an illegal instruction exception the address will be the +instruction pointer where the invalid instruction resides. +See `breakpad <https://chromium.googlesource.com/breakpad/breakpad/+/c99d374dde62654a024840accfb357b2851daea0/src/processor/minidump_processor.cc#675>`__'s +relevant code for further information. 
+ +Since it's not always possible to establish with certainty the address of the +previous frame while walking the stack, every frame has a trust value that +represents how it was found and thus how certain we are that it's a real frame. +The trust levels are (from least trusted to most trusted): + ++---------------+---------------------------------------------------+ +| Trust | Description | ++===============+===================================================+ +| context | Given as instruction pointer in a context | ++---------------+---------------------------------------------------+ +| prewalked | Explicitly provided by some external stack walker | ++---------------+---------------------------------------------------+ +| cfi | Derived from call frame info | ++---------------+---------------------------------------------------+ +| frame_pointer | Derived from frame pointer | ++---------------+---------------------------------------------------+ +| cfi_scan | Found while scanning stack using call frame info | ++---------------+---------------------------------------------------+ +| scan | Scanned the stack, found this | ++---------------+---------------------------------------------------+ +| none | Unknown, this is most likely not a valid frame | ++---------------+---------------------------------------------------+ + +The ``code_id`` field holds a unique ID used to distinguish between different +versions and builds of the same module. See `breakpad <https://chromium.googlesource.com/breakpad/breakpad/+/24f5931c5e0120982c0cbf1896641e3ef2bdd52f/src/google_breakpad/processor/code_module.h#60>`__'s +description for further information. This field is populated only on Windows. + +Version History +--------------- + +- Firefox 58: Added ipc_channel_error (`bug 1410143 <https://bugzilla.mozilla.org/show_bug.cgi?id=1410143>`_). +- Firefox 62: Added LowCommitSpaceEvents (`bug 1464773 <https://bugzilla.mozilla.org/show_bug.cgi?id=1464773>`_). 
+- Firefox 63: Added RecordReplayError (`bug 1481009 <https://bugzilla.mozilla.org/show_bug.cgi?id=1481009>`_). +- Firefox 64: Added MemoryErrorCorrection (`bug 1498609 <https://bugzilla.mozilla.org/show_bug.cgi?id=1498609>`_). +- Firefox 68: Added IndexedDBShutdownTimeout and LocalStorageShutdownTimeout + (`bug 1539750 <https://bugzilla.mozilla.org/show_bug.cgi?id=1539750>`_). +- Firefox 74: Added AvailableSwapMemory and PurgeablePhysicalMemory + (`bug 1587721 <https://bugzilla.mozilla.org/show_bug.cgi?id=1587721>`_). +- Firefox 74: Added MainThreadRunnableName (`bug 1608158 <https://bugzilla.mozilla.org/show_bug.cgi?id=1608158>`_). +- Firefox 76: Added DOMFissionEnabled (`bug 1602918 <https://bugzilla.mozilla.org/show_bug.cgi?id=1602918>`_). +- Firefox 79: Added ExperimentalFeatures (`bug 1644544 <https://bugzilla.mozilla.org/show_bug.cgi?id=1644544>`_). +- Firefox 85: Added QuotaManagerShutdownTimeout, removed IndexedDBShutdownTimeout and LocalStorageShutdownTimeout + (`bug 1672369 <https://bugzilla.mozilla.org/show_bug.cgi?id=1672369>`_). +- Firefox 89: Added GPUProcessLaunchCount (`bug 1710448 <https://bugzilla.mozilla.org/show_bug.cgi?id=1710448>`_) + and ProfilerChildShutdownPhase (`bug 1704680 <https://bugzilla.mozilla.org/show_bug.cgi?id=1704680>`_). +- Firefox 90: Removed MemoryErrorCorrection (`bug 1710152 <https://bugzilla.mozilla.org/show_bug.cgi?id=1710152>`_) + and added WindowsErrorReporting (`bug 1703761 <https://bugzilla.mozilla.org/show_bug.cgi?id=1703761>`_). +- Firefox 95: Added HeadlessMode and BackgroundTaskName (`bug 1697875 <https://bugzilla.mozilla.org/show_bug.cgi?id=1697875>`_). +- Firefox 96: Added WindowsPackageFamilyName (`bug 1738375 <https://bugzilla.mozilla.org/show_bug.cgi?id=1738375>`_). +- Firefox 103: Removed ContainsMemoryReport (`bug 1776279 <https://bugzilla.mozilla.org/show_bug.cgi?id=1776279>`_). +- Firefox 107: Added UtilityActorsName (`bug 1788596 <https://bugzilla.mozilla.org/show_bug.cgi?id=1788596>`_). 
+- Firefox 119: Added WindowsFileDialogErrorCode (`bug 1837079 <https://bugzilla.mozilla.org/show_bug.cgi?id=1837079>`_) +- Firefox 137: Added NimbusEnrollments (`bug 1950661 <https://bugzilla.mozilla.org/show_bug.cgi?id=1950661>`_). +- Firefox 138: Removed ExperimentalFeatures (`bug 1942694 <https://bugzilla.mozilla.org/show_bug.cgi?id=1942694>`_). diff --git a/toolkit/components/telemetry/docs/data/index.rst b/toolkit/components/telemetry/docs/data/index.rst @@ -10,6 +10,7 @@ Data documentation common-ping environment main-ping + crash-ping backgroundhangmonitor-ping anonymous-ping first-shutdown-ping diff --git a/toolkit/components/telemetry/docs/obsolete/crash-ping.rst b/toolkit/components/telemetry/docs/obsolete/crash-ping.rst @@ -1,271 +0,0 @@ -Crash ping -========== - -.. warning:: - The Desktop Telemetry crash ping is no longer sent. Please see the `Glean crash ping definition - <https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/pings/crash>`_, which describes - the metrics included in the Glean crash ping. - -This ping is captured after the main Firefox process crashes or after a child process -process crashes, whether or not the crash report is submitted to -crash-stats.mozilla.org. It includes non-identifying metadata about the crash. - - -This ping is sent either by the ``CrashManager`` or by the crash reporter -client. The ``CrashManager`` is responsible for sending crash pings for the -child processes crashes, which are sent right after the crash is detected, -as well as for main process crashes, which are sent after Firefox restarts -successfully. The crash reporter client sends crash pings only for main process -crashes whether or not the user also reports the crash. The crash reporter -client will not send the crash ping if telemetry has been disabled in Firefox. 
- -The environment block that is sent with this ping varies: if Firefox was running -long enough to record the environment block before the crash, then the environment -at the time of the crash will be recorded and ``hasCrashEnvironment`` will be true. -If Firefox crashed before the environment was recorded, ``hasCrashEnvironment`` will -be false and the recorded environment will be the environment at time of submission. - -The client ID and profile group ID are submitted with this ping. - -The metadata field holds a subset of the crash annotations, all field values -are stored as strings but some may be interpreted either as numbers or -boolean values. Numbers are integral unless stated otherwise in the -description. Boolean values are set to "1" when true, "0" when false. If -they're absent then they should be interpreted as false. - -Structure: - -.. code-block:: js - - { - type: "crash", - ... common ping data - clientId: <UUID>, - profileGroupId: <UUID>, - environment: { ... }, - payload: { - crashDate: "YYYY-MM-DD", - crashTime: <ISO Date>, // per-hour resolution - version: 1, - sessionId: <UUID>, // Telemetry ID of crashing session. May be missing for crashes that happen early in startup - crashId: <UUID>, // Optional, ID of the associated crash - minidumpSha256Hash: <hash>, // SHA256 hash of the minidump file - processType: <type>, // Type of process that crashed, see below for a list of types - stackTraces: { ... }, // Optional, see below - metadata: { // Annotations saved while Firefox was running. 
See CrashAnnotations.yaml for more information - ProductID: "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}", - ProductName: "Firefox", - ReleaseChannel: <channel>, - Version: <version number>, - BuildID: "YYYYMMDDHHMMSS", - AsyncShutdownTimeout: <json>, // Optional, present when a shutdown blocker failed to respond within a reasonable amount of time - AvailablePageFile: <size>, // Windows-only, available paging file in bytes - AvailablePhysicalMemory: <size>, // Windows-only, available physical memory in bytes - AvailableSwapMemory: <size>, // macOS- and Linux-only, available swap space - AvailableVirtualMemory: <size>, // Windows-only, available virtual memory in bytes - BackgroundTaskName: "task_name", // Optional, if the app was invoked in background task mode via `--backgroundtask task_name` - BlockedDllList: <list>, // Windows-only, see WindowsDllBlocklist.cpp for details - BlocklistInitFailed: "1", // Windows-only, present only if the DLL blocklist initialization failed - CrashTime: <time>, // Seconds since the Epoch - DOMFissionEnabled: "1", // Optional, if set indicates that a Fission window had been opened - EventLoopNestingLevel: <levels>, // Optional, present only if >0, indicates the nesting level of the event-loop - FontName: <name>, // Optional, the font family name that is being loaded when the crash occurred - GPUProcessLaunchCount: <num>, // Number of times the GPU process was launched - HeadlessMode: "1", // Optional, "1" if the app was invoked in headless mode via `--headless ...` or `--backgroundtask ...` - ipc_channel_error: <error string>, // Optional, contains the string processing error reason for an ipc-based content crash - IsGarbageCollecting: "1", // Optional, if set indicates that the crash occurred while the garbage collector was running - LowCommitSpaceEvents: <num>, // Windows-only, present only if >0, number of low commit space events detected by the available memory tracker - MainThreadRunnableName: <name>, // Optional, Nightly-only, name 
of the currently executing nsIRunnable on the main thread - MozCrashReason: <reason>, // Optional, contains the string passed to MOZ_CRASH() - NimbusEnrollments: <enrollments>, // Optional, a comma-separated string that specifies the active Nimbus experiments and rollouts, as well as their branches. - OOMAllocationSize: <size>, // Size of the allocation that caused an OOM - ProfilerChildShutdownPhase: <string>, // Profiler shutdown phase - PurgeablePhysicalMemory: <size>, // macOS-only, amount of memory that can be deallocated by the OS in case of memory pressure - QuotaManagerShutdownTimeout: <log-string>, // Optional, contains a list of shutdown steps and status of the quota manager clients - RemoteType: <type>, // Optional, type of content process, see below for a list of types - SecondsSinceLastCrash: <duration>, // Seconds elapsed since the last crash occurred - ShutdownProgress: <phase>, // Optional, contains a string describing the shutdown phase in which the crash occurred - SystemMemoryUsePercentage: <percentage>, // Windows-only, percent of memory in use - StartupCrash: "1", // Optional, if set indicates that Firefox crashed during startup - TextureUsage: <usage>, // Optional, usage of texture memory in bytes - TotalPageFile: <size>, // Windows-only, paging file in use expressed in bytes - TotalPhysicalMemory: <size>, // Windows-only, physical memory in use expressed in bytes - TotalVirtualMemory: <size>, // Windows-only, virtual memory in use expressed in bytes - UptimeTS: <duration>, // Seconds since Firefox was started, this can have a fractional component - User32BeforeBlocklist: "1", // Windows-only, present only if user32.dll was loaded before the DLL blocklist has been initialized - WindowsErrorReporting: "1", // Windows-only, present only if the crash was intercepted by the WER runtime exception module - WindowsFileDialogErrorCode: <error code>, // Windows-only, optional, present only if file-dialog IPC failed - WindowsPackageFamilyName: <string>, 
// Windows-only, a string containing the "Package Family Name" of Firefox, if installed through an MSIX package - }, - hasCrashEnvironment: bool - } - } - -.. note:: - - For "crash" pings generated by the crashreporter we are deliberately truncating the ``creationTime`` - field to hours. See `bug 1345108 <https://bugzilla.mozilla.org/show_bug.cgi?id=1345108>`_ for context. - -Process Types -------------- - -The ``processType`` field contains the type of process that crashed. There are -currently multiple process types defined in ``nsICrashService`` but crash pings -are sent only for the ones below: - -+---------------+-------------------------------------------------------------------------------+ -| Type | Description | -+===============+===============================================================================+ -| main | :ref:`Main process <parent-process>`, also known as parent or browser process | -+---------------+-------------------------------------------------------------------------------+ -| content | :ref:`Content process <content-process>` | -+---------------+-------------------------------------------------------------------------------+ -| gmplugin | :ref:`Gecko media plugin <gecko-media-plugins-process>` | -+---------------+-------------------------------------------------------------------------------+ -| gpu | :ref:`GPU process <gpu-process>` | -+---------------+-------------------------------------------------------------------------------+ -| vr | :ref:`VR process <vr-process>` | -+---------------+-------------------------------------------------------------------------------+ -| rdd | :ref:`Data decoder process <data-decoder-process>` | -+---------------+-------------------------------------------------------------------------------+ -| socket | :ref:`Network socket process <network-socket-process>` | -+---------------+-------------------------------------------------------------------------------+ -| forkserver | :ref:`Fork server 
<fork-server>` | -+---------------+-------------------------------------------------------------------------------+ -| utility | :ref:`Utility process <utility-process>` | -+---------------+-------------------------------------------------------------------------------+ - -.. _remote-process-types: - -Remote Process Types --------------------- - -The optional ``remoteType`` field contains the type of the content process that -crashed. As such it is present only if ``processType`` contains the ``content`` -value. The following content process types are currently defined: - -+-----------+--------------------------------------------------------+ -| Type | Description | -+===========+========================================================+ -| web | The content process was running code from a web page | -+-----------+--------------------------------------------------------+ -| file | The content process was running code from a local file | -+-----------+--------------------------------------------------------+ -| extension | The content process was running code from an extension | -+-----------+--------------------------------------------------------+ - -Stack Traces ------------- - -The crash ping may contain a ``stackTraces`` field which has been populated -with stack traces for all threads in the crashed process. The format of this -field is similar to the one used by Socorro for representing a crash. The main -differences are that redundant fields are not stored and that the module a -frame belongs to is referenced by index in the module array rather than by its -file name. - -Note that this field does not contain data from the application; only bare -stack traces and module lists are stored. - -.. code-block:: js - - { - status: <string>, // Status of the analysis, "OK" or an error message - crash_info: { // Basic crash information - type: <string>, // Type of crash, SIGSEGV, assertion, etc... 
- address: <addr>, // Crash address crash, hex format, see the notes below - crashing_thread: <index> // Index in the thread array below - }, - main_module: <index>, // Index of Firefox' executable in the module list - modules: [{ - base_addr: <addr>, // Base address of the module, hex format - end_addr: <addr>, // End address of the module, hex format - code_id: <string>, // Unique ID of this module, see the notes below - debug_file: <string>, // Name of the file holding the debug information - debug_id: <string>, // ID or hash of the debug information file - filename: <string>, // File name - version: <string>, // Library/executable version - }, - ... // List of modules ordered by base memory address - ], - threads: [{ // Stack traces for every thread - frames: [{ - module_index: <index>, // Index of the module this frame belongs to - ip: <ip>, // Program counter, hex format - trust: <string> // Trust of this frame, see the notes below - }, - ... // List of frames, the first frame is the topmost - ] - }] - } - -Notes -~~~~~ - -Memory addresses and instruction pointers are always stored as strings in -hexadecimal format (e.g. "0x4000"). They can be made of up to 16 characters for -64-bit addresses. - -The crash type is both OS and CPU dependent and can be either a descriptive -string (e.g. SIGSEGV, EXCEPTION_ACCESS_VIOLATION) or a raw numeric value. The -crash address meaning depends on the type of crash. In a segmentation fault the -crash address will be the memory address whose access caused the fault; in a -crash triggered by an illegal instruction exception the address will be the -instruction pointer where the invalid instruction resides. -See `breakpad <https://chromium.googlesource.com/breakpad/breakpad/+/c99d374dde62654a024840accfb357b2851daea0/src/processor/minidump_processor.cc#675>`__'s -relevant code for further information. 
- -Since it's not always possible to establish with certainty the address of the -previous frame while walking the stack, every frame has a trust value that -represents how it was found and thus how certain we are that it's a real frame. -The trust levels are (from least trusted to most trusted): - -+---------------+---------------------------------------------------+ -| Trust | Description | -+===============+===================================================+ -| context | Given as instruction pointer in a context | -+---------------+---------------------------------------------------+ -| prewalked | Explicitly provided by some external stack walker | -+---------------+---------------------------------------------------+ -| cfi | Derived from call frame info | -+---------------+---------------------------------------------------+ -| frame_pointer | Derived from frame pointer | -+---------------+---------------------------------------------------+ -| cfi_scan | Found while scanning stack using call frame info | -+---------------+---------------------------------------------------+ -| scan | Scanned the stack, found this | -+---------------+---------------------------------------------------+ -| none | Unknown, this is most likely not a valid frame | -+---------------+---------------------------------------------------+ - -The ``code_id`` field holds a unique ID used to distinguish between different -versions and builds of the same module. See `breakpad <https://chromium.googlesource.com/breakpad/breakpad/+/24f5931c5e0120982c0cbf1896641e3ef2bdd52f/src/google_breakpad/processor/code_module.h#60>`__'s -description for further information. This field is populated only on Windows. - -Version History ---------------- - -- Firefox 58: Added ipc_channel_error (`bug 1410143 <https://bugzilla.mozilla.org/show_bug.cgi?id=1410143>`_). -- Firefox 62: Added LowCommitSpaceEvents (`bug 1464773 <https://bugzilla.mozilla.org/show_bug.cgi?id=1464773>`_). 
-- Firefox 63: Added RecordReplayError (`bug 1481009 <https://bugzilla.mozilla.org/show_bug.cgi?id=1481009>`_). -- Firefox 64: Added MemoryErrorCorrection (`bug 1498609 <https://bugzilla.mozilla.org/show_bug.cgi?id=1498609>`_). -- Firefox 68: Added IndexedDBShutdownTimeout and LocalStorageShutdownTimeout - (`bug 1539750 <https://bugzilla.mozilla.org/show_bug.cgi?id=1539750>`_). -- Firefox 74: Added AvailableSwapMemory and PurgeablePhysicalMemory - (`bug 1587721 <https://bugzilla.mozilla.org/show_bug.cgi?id=1587721>`_). -- Firefox 74: Added MainThreadRunnableName (`bug 1608158 <https://bugzilla.mozilla.org/show_bug.cgi?id=1608158>`_). -- Firefox 76: Added DOMFissionEnabled (`bug 1602918 <https://bugzilla.mozilla.org/show_bug.cgi?id=1602918>`_). -- Firefox 79: Added ExperimentalFeatures (`bug 1644544 <https://bugzilla.mozilla.org/show_bug.cgi?id=1644544>`_). -- Firefox 85: Added QuotaManagerShutdownTimeout, removed IndexedDBShutdownTimeout and LocalStorageShutdownTimeout - (`bug 1672369 <https://bugzilla.mozilla.org/show_bug.cgi?id=1672369>`_). -- Firefox 89: Added GPUProcessLaunchCount (`bug 1710448 <https://bugzilla.mozilla.org/show_bug.cgi?id=1710448>`_) - and ProfilerChildShutdownPhase (`bug 1704680 <https://bugzilla.mozilla.org/show_bug.cgi?id=1704680>`_). -- Firefox 90: Removed MemoryErrorCorrection (`bug 1710152 <https://bugzilla.mozilla.org/show_bug.cgi?id=1710152>`_) - and added WindowsErrorReporting (`bug 1703761 <https://bugzilla.mozilla.org/show_bug.cgi?id=1703761>`_). -- Firefox 95: Added HeadlessMode and BackgroundTaskName (`bug 1697875 <https://bugzilla.mozilla.org/show_bug.cgi?id=1697875>`_). -- Firefox 96: Added WindowsPackageFamilyName (`bug 1738375 <https://bugzilla.mozilla.org/show_bug.cgi?id=1738375>`_). -- Firefox 103: Removed ContainsMemoryReport (`bug 1776279 <https://bugzilla.mozilla.org/show_bug.cgi?id=1776279>`_). -- Firefox 107: Added UtilityActorsName (`bug 1788596 <https://bugzilla.mozilla.org/show_bug.cgi?id=1788596>`_). 
-- Firefox 119: Added WindowsFileDialogErrorCode (`bug 1837079 <https://bugzilla.mozilla.org/show_bug.cgi?id=1837079>`_) -- Firefox 137: Added NimbusEnrollments (`bug 1950661 <https://bugzilla.mozilla.org/show_bug.cgi?id=1950661>`_). -- Firefox 138: Removed ExperimentalFeatures (`bug 1942694 <https://bugzilla.mozilla.org/show_bug.cgi?id=1942694>`_). diff --git a/toolkit/crashreporter/client/app/src/config.rs b/toolkit/crashreporter/client/app/src/config.rs @@ -100,6 +100,8 @@ pub struct Config { pub data_dir: Option<PathBuf>, /// The events directory. pub events_dir: Option<PathBuf>, + /// The ping directory. + pub ping_dir: Option<PathBuf>, /// The profile directory in use when the crash occurred. pub profile_dir: Option<PathBuf>, /// The dump file. @@ -149,6 +151,7 @@ impl Config { self.run_memtest = env_bool(ekey!("RUN_MEMTEST")); self.data_dir = env_path(ekey!("DATA_DIRECTORY")); self.events_dir = env_path(ekey!("EVENTS_DIRECTORY")); + self.ping_dir = env_path(ekey!("PING_DIRECTORY")); self.app_file = std::env::var_os(ekey!("RESTART_XUL_APP_FILE")); self.update_log_file(); diff --git a/toolkit/crashreporter/client/app/src/logic.rs b/toolkit/crashreporter/client/app/src/logic.rs @@ -22,6 +22,7 @@ use crate::{ ui::{ReportCrashUI, ReportCrashUIState, SubmitState}, }; use anyhow::Context; +use uuid::Uuid; pub mod annotations; @@ -78,8 +79,8 @@ impl ReportCrash { log::warn!("failed to compute minidump hash: {e:#}"); None }); - self.send_crash_ping(hash.as_deref()); - if let Err(e) = self.update_events_file(hash.as_deref()) { + let ping_uuid = self.send_crash_ping(hash.as_deref()); + if let Err(e) = self.update_events_file(hash.as_deref(), ping_uuid) { log::warn!("failed to update events file: {e:#}"); } self.check_eol_version()?; @@ -123,17 +124,26 @@ impl ReportCrash { } /// Send crash pings to legacy telemetry and Glean. - fn send_crash_ping(&self, minidump_hash: Option<&str>) { + /// + /// Returns the crash ping uuid used in legacy telemetry. 
+ fn send_crash_ping(&self, minidump_hash: Option<&str>) -> Option<Uuid> { net::ping::CrashPing { + crash_id: self.config.local_dump_id().as_ref(), extra: &self.extra, + ping_dir: self.config.ping_dir.as_deref(), minidump_hash, + pingsender_path: crate::config::installation_program_path("pingsender").as_ref(), } .send() } /// Update the events file with information about the crash ping, minidump hash, and /// stacktraces. - fn update_events_file(&self, minidump_hash: Option<&str>) -> anyhow::Result<()> { + fn update_events_file( + &self, + minidump_hash: Option<&str>, + ping_uuid: Option<Uuid>, + ) -> anyhow::Result<()> { use crate::std::io::{BufRead, Error, ErrorKind, Write}; struct EventsFile { event_version: String, @@ -196,6 +206,9 @@ impl ReportCrash { if let Some(hash) = minidump_hash { events_file.data["MinidumpSha256Hash"] = hash.into(); } + if let Some(uuid) = ping_uuid { + events_file.data["CrashPingUUID"] = uuid.to_string().into(); + } events_file.data["StackTraces"] = self.extra["StackTraces"].clone(); // Write altered events file. diff --git a/toolkit/crashreporter/client/app/src/logic/annotations.rs b/toolkit/crashreporter/client/app/src/logic/annotations.rs @@ -9,6 +9,11 @@ // static REPORT_ANNOTATIONS: phf::Set<&'static str>; include!(concat!(env!("OUT_DIR"), "/crash_annotations.rs")); +/// Return whether the given annotation can be sent in a crash ping. +pub fn send_in_ping(annotation: &str) -> bool { + PING_ANNOTATIONS.contains(annotation) +} + /// Return whether the given annotation can be sent in a crash report. pub fn send_in_report(annotation: &str) -> bool { REPORT_ANNOTATIONS.contains(annotation) || PING_ANNOTATIONS.contains(annotation) diff --git a/toolkit/crashreporter/client/app/src/main.rs b/toolkit/crashreporter/client/app/src/main.rs @@ -146,6 +146,7 @@ fn report_main() { // Actual content doesn't matter, aside from the hash that is generated. 
const MOCK_MINIDUMP_FILE: &[u8] = &[1, 2, 3, 4]; const MOCK_CURRENT_TIME: &str = "2004-11-09T12:34:56Z"; + const MOCK_PING_UUID: uuid::Uuid = uuid::Uuid::nil(); const MOCK_REMOTE_CRASH_ID: &str = "8cbb847c-def2-4f68-be9e-000000000000"; // Initialize logging but don't set it in the configuration, so that it won't be redirected to @@ -189,12 +190,14 @@ fn report_main() { .unwrap() .into(), ) + .set(mock::MockHook::new("ping_uuid"), MOCK_PING_UUID) .set(mock::MockHook::new("enable_glean_pings"), false); let result = mock.run(|| { let mut cfg = Config::new(); cfg.data_dir = Some("data_dir".into()); cfg.events_dir = Some("events_dir".into()); + cfg.ping_dir = Some("ping_dir".into()); cfg.dump_file = Some("minidump.dmp".into()); cfg.restart_command = Some("mockfox".into()); cfg.strings = Some(lang::load()); diff --git a/toolkit/crashreporter/client/app/src/net/http.rs b/toolkit/crashreporter/client/app/src/net/http.rs @@ -66,6 +66,9 @@ pub fn user_agent() -> &'static str { pub enum RequestBuilder<'a> { /// Send a POST with multiple mime parts. MimePost { parts: Vec<MimePart<'a>> }, + /// Gzip and POST a file's contents. + #[allow(unused)] + GzipAndPostFile { file: &'a Path }, /// Send a POST. Post { body: &'a [u8], @@ -115,6 +118,28 @@ pub enum Request<'a> { }, } +/// Format a `time::OffsetDateTime` using the date format described by RFC 7231, section +/// 7.1.1.2, for use in the HTTP Date header. 
+fn format_rfc7231_datetime(datetime: time::OffsetDateTime) -> anyhow::Result<String> { + let format = time::macros::format_description!( + "[weekday repr:short], [day] [month repr:short] [year] [hour]:[minute]:[second] GMT" + ); + datetime + .to_offset(time::UtcOffset::UTC) + .format(format) + .context("failed to format datetime") +} + +fn now_date_header() -> Option<String> { + match format_rfc7231_datetime(time::OffsetDateTime::now_utc()) { + Err(e) => { + log::warn!("failed to format Date header, omitting: {e}"); + None + } + Ok(s) => Some(format!("Date: {s}")), + } +} + impl<'a> RequestBuilder<'a> { /// Build the request with the given url. pub fn build(&self, url: &'a str) -> std::io::Result<Request<'a>> { @@ -214,6 +239,15 @@ impl<'a> RequestBuilder<'a> { part.curl_command_args(&mut cmd, &mut stdin)?; } } + Self::GzipAndPostFile { file } => { + cmd.args(["--header", "Content-Encoding: gzip", "--data-binary", "@-"]); + if let Some(header) = now_date_header() { + cmd.args(["--header", &header]); + } + + let encoder = flate2::read::GzEncoder::new(File::open(file)?, Default::default()); + stdin = Some(Box::new(encoder)); + } Self::Post { body, headers } => { for (k, v) in headers.iter() { cmd.args(["--header", &format!("{k}: {v}")]); @@ -253,6 +287,20 @@ impl<'a> RequestBuilder<'a> { easy.set_mime_post(mime)?; } + Self::GzipAndPostFile { file } => { + let mut headers = easy.slist(); + headers.append("Content-Encoding: gzip")?; + if let Some(header) = now_date_header() { + headers.append(&header)?; + } + easy.set_headers(headers)?; + + let mut encoder = + flate2::read::GzEncoder::new(File::open(file)?, Default::default()); + let mut data = Vec::new(); + encoder.read_to_end(&mut data)?; + easy.set_postfields(data)?; + } Self::Post { body, headers } => { let mut header_list = easy.slist(); for (k, v) in headers.iter() { diff --git a/toolkit/crashreporter/client/app/src/net/ping/legacy_telemetry.rs b/toolkit/crashreporter/client/app/src/net/ping/legacy_telemetry.rs 
@@ -0,0 +1,190 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//! Support for legacy telemetry ping creation. The ping supports serialization which should be +//! used when submitting. + +use crate::logic::annotations; +use crate::std; +use anyhow::Context; +use serde::Serialize; +use std::collections::BTreeMap; +use time::format_description::well_known::{iso8601, Iso8601}; +use uuid::Uuid; + +const TELEMETRY_VERSION: u64 = 4; +const PAYLOAD_VERSION: u64 = 1; + +// We need a custom time serializer to encode exactly 3 decimal digits in the fractions of a second +// (millisecond precision). +const TIME_CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT + .set_time_precision(iso8601::TimePrecision::Second { + // SAFETY: 3 is non-zero + decimal_digits: Some(unsafe { std::num::NonZeroU8::new_unchecked(3) }), + }) + .encode(); +const TIME_FORMAT: Iso8601<TIME_CONFIG> = Iso8601::<TIME_CONFIG>; +time::serde::format_description!(time_format, OffsetDateTime, TIME_FORMAT); + +#[derive(Serialize)] +#[serde( + tag = "type", + rename_all = "camelCase", + rename_all_fields = "camelCase" +)] +pub enum Ping<'a> { + Crash { + id: &'a Uuid, + version: u64, + #[serde(with = "time_format")] + creation_date: time::OffsetDateTime, + client_id: &'a str, + profile_group_id: &'a str, + #[serde(skip_serializing_if = "serde_json::Value::is_null")] + environment: serde_json::Value, + payload: Payload<'a>, + application: Application<'a>, + }, +} + +time::serde::format_description!(date_format, Date, "[year]-[month]-[day]"); + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Payload<'a> { + session_id: &'a str, + version: u64, + #[serde(with = "date_format")] + crash_date: time::Date, + #[serde(with = "time_format")] + crash_time: time::OffsetDateTime, + has_crash_environment: bool, + crash_id: &'a str, 
+ minidump_sha256_hash: Option<&'a str>, + process_type: &'a str, + #[serde(skip_serializing_if = "serde_json::Value::is_null")] + stack_traces: serde_json::Value, + metadata: BTreeMap<&'a str, &'a str>, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Application<'a> { + vendor: &'a str, + name: &'a str, + build_id: &'a str, + display_version: String, + platform_version: String, + version: &'a str, + channel: &'a str, + #[serde(skip_serializing_if = "Option::is_none")] + architecture: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + xpcom_abi: Option<String>, +} + +impl<'a> Ping<'a> { + pub fn crash( + ping_id: &'a Uuid, + extra: &'a serde_json::Value, + crash_id: &'a str, + minidump_sha256_hash: Option<&'a str>, + ) -> anyhow::Result<Self> { + let now: time::OffsetDateTime = crate::std::time::SystemTime::now().into(); + let environment: serde_json::Value = extra["TelemetryEnvironment"] + .as_str() + .and_then(|estr| serde_json::from_str(estr).ok()) + .unwrap_or_default(); + + // The subset of extra file entries (crash annotations) which are allowed in pings. 
+ let metadata = extra + .as_object() + .map(|map| { + map.iter() + .filter_map(|(k, v)| { + annotations::send_in_ping(k) + .then(|| k.as_str()) + .zip(v.as_str()) + }) + .collect() + }) + .unwrap_or_default(); + + let display_version = environment + .pointer("/build/displayVersion") + .and_then(|s| s.as_str()) + .unwrap_or_default() + .to_owned(); + let platform_version = environment + .pointer("/build/platformVersion") + .and_then(|s| s.as_str()) + .unwrap_or_default() + .to_owned(); + let architecture = environment + .pointer("/build/architecture") + .and_then(|s| s.as_str()) + .map(ToOwned::to_owned); + let xpcom_abi = environment + .pointer("/build/xpcomAbi") + .and_then(|s| s.as_str()) + .map(ToOwned::to_owned); + + Ok(Ping::Crash { + id: ping_id, + version: TELEMETRY_VERSION, + creation_date: now, + client_id: extra["TelemetryClientId"] + .as_str() + .context("missing TelemetryClientId")?, + profile_group_id: extra["TelemetryProfileGroupId"] + .as_str() + .context("missing TelemetryProfileGroupId")?, + environment, + payload: Payload { + session_id: extra["TelemetrySessionId"] + .as_str() + .context("missing TelemetrySessionId")?, + version: PAYLOAD_VERSION, + crash_date: now.date(), + crash_time: now, + has_crash_environment: true, + crash_id, + minidump_sha256_hash, + process_type: "main", + stack_traces: extra["StackTraces"].clone(), + metadata, + }, + application: Application { + vendor: extra["Vendor"].as_str().unwrap_or_default(), + name: extra["ProductName"].as_str().unwrap_or_default(), + build_id: extra["BuildID"].as_str().unwrap_or_default(), + display_version, + platform_version, + version: extra["Version"].as_str().unwrap_or_default(), + channel: extra["ReleaseChannel"].as_str().unwrap_or_default(), + architecture, + xpcom_abi, + }, + }) + } + + /// Generate the telemetry URL for submitting this ping. 
+ pub fn submission_url(&self, extra: &serde_json::Value) -> anyhow::Result<String> { + let url = extra["TelemetryServerURL"] + .as_str() + .context("missing TelemetryServerURL")?; + let id = match self { + Self::Crash { id, .. } => id, + }; + let name = extra["ProductName"] + .as_str() + .context("missing ProductName")?; + let version = extra["Version"].as_str().context("missing Version")?; + let channel = extra["ReleaseChannel"] + .as_str() + .context("missing ReleaseChannel")?; + let buildid = extra["BuildID"].as_str().context("missing BuildID")?; + Ok(format!("{url}/submit/telemetry/{id}/crash/{name}/{version}/{channel}/{buildid}?v={TELEMETRY_VERSION}")) + } +} diff --git a/toolkit/crashreporter/client/app/src/net/ping/mod.rs b/toolkit/crashreporter/client/app/src/net/ping/mod.rs @@ -4,26 +4,45 @@ //! Crash pings. -use crate::std::mock; +use crate::std; +use crate::std::path::Path; +use anyhow::Context; +use uuid::Uuid; mod glean; +mod legacy_telemetry; pub struct CrashPing<'a> { + pub crash_id: &'a str, pub extra: &'a serde_json::Value, + pub ping_dir: Option<&'a Path>, pub minidump_hash: Option<&'a str>, + pub pingsender_path: &'a Path, } impl CrashPing<'_> { /// Send the crash ping. - pub fn send(&self) { + /// + /// Returns the crash ping id if the ping could be sent. Any errors are logged. + pub fn send(&self) -> Option<Uuid> { + let id = new_id(); + // Glean ping tests have to be run serially (because the glean interface is a global), but // we can run tests that are uninterested in glean pings in parallel by disabling the pings // here. 
- if mock::hook(true, "enable_glean_pings") { + if std::mock::hook(true, "enable_glean_pings") { if let Err(e) = self.send_glean() { log::error!("failed to send glean ping: {e:#}"); } } + + match self.send_legacy(&id) { + Err(e) => { + log::error!("failed to send legacy ping: {e:#}"); + None + } + Ok(sent) => sent.then_some(id), + } } fn send_glean(&self) -> anyhow::Result<()> { @@ -32,4 +51,48 @@ impl CrashPing<'_> { crate::glean::crash.submit(Some("crash")); Ok(()) } + + fn send_legacy(&self, id: &Uuid) -> anyhow::Result<bool> { + let Some(ping_dir) = self.ping_dir else { + log::warn!("not sending legacy crash ping because no ping directory configured"); + return Ok(false); + }; + + let ping = legacy_telemetry::Ping::crash(id, self.extra, self.crash_id, self.minidump_hash) + .context("failed to create telemetry crash ping")?; + + let submission_url = ping + .submission_url(self.extra) + .context("failed to generate ping submission URL")?; + + let target_file = ping_dir.join(format!("{}.json", id)); + + let file = std::fs::File::create(&target_file).with_context(|| { + format!( + "failed to open ping file {} for writing", + target_file.display() + ) + })?; + + serde_json::to_writer(file, &ping).context("failed to serialize telemetry crash ping")?; + + crate::process::background_command(self.pingsender_path) + .arg(submission_url) + .arg(target_file) + .spawn() + .with_context(|| { + format!( + "failed to launch pingsender process at {}", + self.pingsender_path.display() + ) + })?; + + // TODO asynchronously get pingsender result and log it? + + Ok(true) + } +} + +fn new_id() -> Uuid { + crate::std::mock::hook(Uuid::new_v4(), "ping_uuid") } diff --git a/toolkit/crashreporter/client/app/src/test.rs b/toolkit/crashreporter/client/app/src/test.rs @@ -111,7 +111,9 @@ macro_rules! 
current_date { "2004-11-09" }; } +const MOCK_CURRENT_DATE: &str = current_date!(); const MOCK_CURRENT_TIME: &str = concat!(current_date!(), "T12:34:56.000Z"); +const MOCK_PING_UUID: uuid::Uuid = uuid::Uuid::nil(); const MOCK_REMOTE_CRASH_ID: &str = "8cbb847c-def2-4f68-be9e-000000000000"; fn current_datetime() -> time::OffsetDateTime { @@ -135,6 +137,7 @@ fn test_config() -> Config { let mut cfg = Config::default(); cfg.data_dir = Some("data_dir".into()); cfg.events_dir = Some("events_dir".into()); + cfg.ping_dir = Some("ping_dir".into()); cfg.dump_file = Some("minidump.dmp".into()); cfg.strings = Some(Default::default()); // Set delete_dump to true: this matches the default case in practice. @@ -208,7 +211,8 @@ impl GuiTest { ) .set(crate::std::env::MockTempDir, "tmp".into()) .set(crate::std::time::MockCurrentTime, current_system_time()) - .set(mock::MockHook::new("enable_glean_pings"), false); + .set(mock::MockHook::new("enable_glean_pings"), false) + .set(mock::MockHook::new("ping_uuid"), MOCK_PING_UUID); GuiTest { config: test_config(), @@ -369,6 +373,52 @@ impl AssertFiles { self } + /// Assert that a crash ping was created for sending according to the filesystem. + pub fn ping(&mut self) -> &mut Self { + self.inner.check( + format!("ping_dir/{MOCK_PING_UUID}.json"), + serde_json::json! 
{{ + "type": "crash", + "id": MOCK_PING_UUID, + "version": 4, + "creationDate": MOCK_CURRENT_TIME, + "clientId": "telemetry_client", + "profileGroupId": "telemetry_profile_group", + "payload": { + "sessionId": "telemetry_session", + "version": 1, + "crashDate": MOCK_CURRENT_DATE, + "crashTime": MOCK_CURRENT_TIME, + "hasCrashEnvironment": true, + "crashId": "minidump", + "minidumpSha256Hash": MOCK_MINIDUMP_SHA256, + "processType": "main", + "stackTraces": { + "status": "OK" + }, + "metadata": { + "AsyncShutdownTimeout": "{}", + "BuildID": "1234", + "ProductName": "Bar", + "ReleaseChannel": "release", + "Version": "100.0", + } + }, + "application": { + "vendor": "FooCorp", + "name": "Bar", + "buildId": "1234", + "displayVersion": "", + "platformVersion": "", + "version": "100.0", + "channel": "release" + } + }} + .to_string(), + ); + self + } + /// Assert that a crash submission event was written with the given submission status. pub fn submission_event(&mut self, success: bool) -> &mut Self { self.inner.check( @@ -670,13 +720,16 @@ fn no_submit() { #[test] fn ping_and_event_files() { let mut test = GuiTest::new(); - test.files.add_dir("events_dir").add_file( - "events_dir/minidump", - "1\n\ + test.files + .add_dir("ping_dir") + .add_dir("events_dir") + .add_file( + "events_dir/minidump", + "1\n\ 12:34:56\n\ e0423878-8d59-4452-b82e-cad9c846836e\n\ {\"foo\":\"bar\"}", - ); + ); test.run(|interact| { interact.element("quit", |_style, b: &model::Button| b.click.fire(&())); }); @@ -684,6 +737,7 @@ fn ping_and_event_files() { .saved_settings(Settings::default()) .submitted() .submission_event(true) + .ping() .check( "events_dir/minidump", format!( @@ -694,6 +748,7 @@ fn ping_and_event_files() { serde_json::json! 
{{ "foo": "bar", "MinidumpSha256Hash": MOCK_MINIDUMP_SHA256, + "CrashPingUUID": MOCK_PING_UUID, "StackTraces": { "status": "OK" } }} ), @@ -704,13 +759,16 @@ fn ping_and_event_files() { fn network_failure() { let invoked = Counter::new(); let mut test = GuiTest::new(); - test.files.add_dir("events_dir").add_file( - "events_dir/minidump", - "1\n\ + test.files + .add_dir("ping_dir") + .add_dir("events_dir") + .add_file( + "events_dir/minidump", + "1\n\ 12:34:56\n\ e0423878-8d59-4452-b82e-cad9c846836e\n\ {\"foo\":\"bar\"}", - ); + ); test.mock.set( net::http::MockHttp, Box::new(cc! { (invoked) move |_request, _url| { @@ -726,6 +784,7 @@ fn network_failure() { .saved_settings(Settings::default()) .pending() .submission_event(false) + .ping() .check( "events_dir/minidump", format!( @@ -736,6 +795,7 @@ fn network_failure() { serde_json::json! {{ "foo": "bar", "MinidumpSha256Hash": MOCK_MINIDUMP_SHA256, + "CrashPingUUID": MOCK_PING_UUID, "StackTraces": { "status": "OK" } }} ), @@ -749,13 +809,16 @@ fn pingsender_failure() { Command::mock("work_dir/pingsender"), Box::new(|_| Err(ErrorKind::NotFound.into())), ); - test.files.add_dir("events_dir").add_file( - "events_dir/minidump", - "1\n\ + test.files + .add_dir("ping_dir") + .add_dir("events_dir") + .add_file( + "events_dir/minidump", + "1\n\ 12:34:56\n\ e0423878-8d59-4452-b82e-cad9c846836e\n\ {\"foo\":\"bar\"}", - ); + ); test.run(|interact| { interact.element("quit", |_style, b: &model::Button| b.click.fire(&())); }); @@ -763,6 +826,7 @@ fn pingsender_failure() { .saved_settings(Settings::default()) .submitted() .submission_event(true) + .ping() .check( "events_dir/minidump", format!(