commit 1abede9c2655529600e24ca666bd03f3ec788a26
parent f0361e4a5f69a5012e1171b47ed0a60a35110802
Author: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 12 Dec 2025 20:57:53 +0000
Bug 1995907 - Implement Glean event for OOM killer with PSI context r=nika,jld
This patch records memory pressure metrics via Glean when the Linux OOM killer
terminates content processes with SIGKILL. Since SIGKILL prevents crash reporters
from capturing PSI (Pressure Stall Information) data, the parent process detects
SIGKILL-terminated children and records memory pressure information.
The implementation:
- Adds process_oom_killed Glean event in xpcom/metrics.yaml with PSI extra keys
(psi_some_avg10, psi_some_avg60, psi_full_avg10, psi_full_avg60)
- Updates process_watcher_posix_sigchld.cc to use type-safe Glean API
- Removes legacy telemetry event from Events.yaml
- Uses PSI values from AvailableMemoryWatcher's periodic capture
- Dispatches recording to main thread for safe memory access
This modernizes the telemetry infrastructure by using Glean's structured metrics
instead of legacy event strings, providing better type safety and validation.
The PSIInfo struct is made accessible via GetLastPSISnapshot() API to allow
any component to read current PSI values on Linux.
Differential Revision: https://phabricator.services.mozilla.com/D269695
Diffstat:
7 files changed, 157 insertions(+), 17 deletions(-)
diff --git a/ipc/chromium/chromium-config.mozbuild b/ipc/chromium/chromium-config.mozbuild
@@ -9,6 +9,12 @@ LOCAL_INCLUDES += [
"/ipc/chromium/src",
]
+if CONFIG["OS_TARGET"] == "Linux":
+ LOCAL_INCLUDES += [
+ "/toolkit/components/telemetry",
+ "/xpcom/base",
+ ]
+
if CONFIG["OS_ARCH"] == "WINNT":
OS_LIBS += [
"shell32",
diff --git a/ipc/chromium/src/chrome/common/process_watcher_posix_sigchld.cc b/ipc/chromium/src/chrome/common/process_watcher_posix_sigchld.cc
@@ -31,6 +31,12 @@
# include "mozilla/ipc/ForkServiceChild.h"
#endif
+#if defined(XP_LINUX) && !defined(ANDROID)
+# include "mozilla/AvailableMemoryWatcher.h"
+# include "mozilla/glean/XpcomMetrics.h"
+# include "nsPrintfCString.h"
+#endif
+
// Just to make sure the moz.build is doing the right things with
// TARGET_OS and/or OS_TARGET:
#if defined(MOZ_WIDGET_ANDROID) || defined(MOZ_WIDGET_UIKIT)
@@ -99,6 +105,26 @@ static mozilla::StaticDataMutex<mozilla::StaticAutoPtr<nsTArray<PendingChild>>>
static int gSignalPipe[2] = {-1, -1};
static mozilla::Atomic<bool> gProcessWatcherShutdown;
+#if defined(XP_LINUX) && !defined(ANDROID)
+// Record Glean event when a content process is killed by OOM killer
+static void RecordContentProcessOOMKilled() {
+ // Get PSI data
+ mozilla::PSIInfo psi;
+ nsresult rv = mozilla::GetLastPSISnapshot(psi);
+
+ if (NS_SUCCEEDED(rv)) {
+ // Record Glean event with PSI metrics
+ mozilla::glean::memory_watcher::process_oom_killed.Record(
+ mozilla::Some(mozilla::glean::memory_watcher::ProcessOomKilledExtra{
+ mozilla::Some(nsPrintfCString("%lu", psi.some_avg10)),
+ mozilla::Some(nsPrintfCString("%lu", psi.some_avg60)),
+ mozilla::Some(nsPrintfCString("%lu", psi.full_avg10)),
+ mozilla::Some(nsPrintfCString("%lu", psi.full_avg60)),
+ }));
+ }
+}
+#endif
+
// A wrapper around WaitForProcess to simplify the result (true if the
// process exited and the pid is now freed for reuse, false if it's
// still running), and handle the case where "blocking" mode doesn't
@@ -133,6 +159,14 @@ static bool IsProcessDead(pid_t pid, BlockingWait aBlock) {
case base::ProcessStatus::Killed:
CHROMIUM_LOG(WARNING)
<< "process " << pid << " exited on signal " << info;
+#if defined(XP_LINUX) && !defined(ANDROID)
+ // Record telemetry for OOM kills
+ if (info == SIGKILL) {
+ NS_DispatchToMainThread(
+ NS_NewRunnableFunction("ContentProcessOOMTelemetry",
+ []() { RecordContentProcessOOMKilled(); }));
+ }
+#endif
return true;
case base::ProcessStatus::Error:
diff --git a/xpcom/base/AvailableMemoryWatcher.h b/xpcom/base/AvailableMemoryWatcher.h
@@ -17,6 +17,30 @@
namespace mozilla {
+#if defined(XP_LINUX) && !defined(ANDROID)
+// PSIInfo struct holds parsed data from /proc/pressure/memory
+//
+// The values in /proc/pressure/memory are floating point numbers, but
+// PSIInfo has integer members (truncated values).
+struct PSIInfo {
+ unsigned long some_avg10 = 0;
+ unsigned long some_avg60 = 0;
+ unsigned long some_avg300 = 0;
+ unsigned long some_total = 0;
+ unsigned long full_avg10 = 0;
+ unsigned long full_avg60 = 0;
+ unsigned long full_avg300 = 0;
+ unsigned long full_total = 0;
+};
+
+// Get PSI (Pressure Stall Information) data from the last periodic update.
+// This function can be called from any thread and returns the most recently
+// captured PSI values from /proc/pressure/memory.
+// Returns NS_OK if successful, NS_ERROR_FAILURE if PSI is not available
+// or the file format is invalid.
+nsresult GetLastPSISnapshot(PSIInfo& aResult);
+#endif
+
// This class implements a platform-independent part to watch the system's
// memory situation and invoke the registered callbacks when we detect
// a low-memory situation or a high-memory situation.
diff --git a/xpcom/base/AvailableMemoryWatcherLinux.cpp b/xpcom/base/AvailableMemoryWatcherLinux.cpp
@@ -18,25 +18,12 @@
#include "nsString.h"
#include <cstring>
#include <cstdio>
+#if !defined(ANDROID)
+# include "nsIPSIProvider.h"
+#endif
namespace mozilla {
-/* PSIInfo struct holds parsed data from /proc/pressure/memory
- *
- * The values in /proc/pressure/memory are floating point numbers, but
- * PSIInfo has integer members.
- */
-struct PSIInfo {
- unsigned long some_avg10 = 0;
- unsigned long some_avg60 = 0;
- unsigned long some_avg300 = 0;
- unsigned long some_total = 0;
- unsigned long full_avg10 = 0;
- unsigned long full_avg60 = 0;
- unsigned long full_avg300 = 0;
- unsigned long full_total = 0;
-};
-
// Read PSI (Pressure Stall Information) data from /proc/pressure/memory
static nsresult ReadPSIFile(const char* aPSIPath, PSIInfo& aResult) {
ScopedCloseFile file(fopen(aPSIPath, "r"));
@@ -106,6 +93,9 @@ class nsAvailableMemoryWatcher final
: public nsITimerCallback,
public nsINamed,
public nsAvailableMemoryWatcherBase,
+#if !defined(ANDROID)
+ public nsIPSIProvider,
+#endif
public nsIAvailableMemoryWatcherTestingLinux {
public:
NS_DECL_ISUPPORTS_INHERITED
@@ -120,6 +110,10 @@ class nsAvailableMemoryWatcher final
void HandleLowMemory();
void MaybeHandleHighMemory();
+#if !defined(ANDROID)
+ NS_IMETHOD GetCachedPSIInfo(mozilla::PSIInfo& aResult) override;
+#endif
+
private:
~nsAvailableMemoryWatcher();
void StartPolling(const MutexAutoLock&);
@@ -167,6 +161,31 @@ nsAvailableMemoryWatcher::nsAvailableMemoryWatcher()
nsAvailableMemoryWatcher::~nsAvailableMemoryWatcher() {}
+NS_IMETHODIMP
+nsAvailableMemoryWatcher::GetCachedPSIInfo(mozilla::PSIInfo& aResult) {
+ MutexAutoLock lock(mMutex);
+ aResult = mPSIInfo;
+ return NS_OK;
+}
+
+// Public API to get latest cached PSI snapshot from the singleton
+// This returns the PSI data that was last collected by the watcher
+nsresult GetLastPSISnapshot(PSIInfo& aResult) {
+ RefPtr<nsIAvailableMemoryWatcherBase> watcher =
+ nsAvailableMemoryWatcherBase::GetSingleton();
+ if (!watcher) {
+ return NS_ERROR_NOT_AVAILABLE;
+ }
+
+ nsCOMPtr<nsIPSIProvider> provider = do_QueryInterface(watcher);
+
+ if (!provider) {
+ return NS_ERROR_NOT_AVAILABLE;
+ }
+
+ return provider->GetCachedPSIInfo(aResult);
+}
+
nsresult nsAvailableMemoryWatcher::Init() {
nsresult rv = nsAvailableMemoryWatcherBase::Init();
if (NS_FAILED(rv)) {
@@ -207,7 +226,7 @@ already_AddRefed<nsAvailableMemoryWatcherBase> CreateAvailableMemoryWatcher() {
NS_IMPL_ISUPPORTS_INHERITED(nsAvailableMemoryWatcher,
nsAvailableMemoryWatcherBase, nsITimerCallback,
- nsIObserver, nsINamed,
+ nsIObserver, nsINamed, nsIPSIProvider,
nsIAvailableMemoryWatcherTestingLinux);
void nsAvailableMemoryWatcher::StopPolling(const MutexAutoLock&)
diff --git a/xpcom/base/moz.build b/xpcom/base/moz.build
@@ -27,6 +27,9 @@ if CONFIG["OS_TARGET"] == "Linux":
XPIDL_SOURCES += [
"nsIAvailableMemoryWatcherTestingLinux.idl",
]
+ EXPORTS += [
+ "nsIPSIProvider.h",
+ ]
if CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa":
XPIDL_SOURCES += [
diff --git a/xpcom/base/nsIPSIProvider.h b/xpcom/base/nsIPSIProvider.h
@@ -0,0 +1,26 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsIPSIProvider_h__
+#define nsIPSIProvider_h__
+
+#include "nsISupports.h"
+
+namespace mozilla {
+struct PSIInfo;
+}
+
+// Interface to get PSI (Pressure Stall Information) data
+#define NS_IPSIPROVIDER_IID \
+ {0x3c2ba80c, 0x6603, 0x4edb, {0xb5, 0x0b, 0xab, 0x6c, 0x76, 0x98, 0x57, 0xc5}}
+
+class nsIPSIProvider : public nsISupports {
+ public:
+ NS_INLINE_DECL_STATIC_IID(NS_IPSIPROVIDER_IID)
+
+ NS_IMETHOD GetCachedPSIInfo(mozilla::PSIInfo& aResult) = 0;
+};
+
+#endif // nsIPSIProvider_h__
diff --git a/xpcom/metrics.yaml b/xpcom/metrics.yaml
@@ -56,6 +56,34 @@ memory_watcher:
type: string
telemetry_mirror: Memory_watcher_OnHighMemory_Stats
+ process_oom_killed:
+ type: event
+ description: >
+ Recorded when a content process is killed by the Linux OOM killer (SIGKILL).
+ PSI (Pressure Stall Information) metrics are included to understand memory
+ pressure context at the time of the kill. This helps analyze memory pressure
+ patterns and OOM kill correlation.
+ bugs:
+ - https://bugzilla.mozilla.org/show_bug.cgi?id=1995907
+ data_reviews:
+ - https://bugzilla.mozilla.org/show_bug.cgi?id=1995907
+ notification_emails:
+ - tli@mozilla.com
+ expires: never
+ extra_keys:
+ psi_some_avg10:
+ description: PSI some avg10 value (10-second average partial stall percentage)
+ type: string
+ psi_some_avg60:
+ description: PSI some avg60 value (60-second average partial stall percentage)
+ type: string
+ psi_full_avg10:
+ description: PSI full avg10 value (10-second average complete stall percentage)
+ type: string
+ psi_full_avg60:
+ description: PSI full avg60 value (60-second average complete stall percentage)
+ type: string
+
memory_phc:
slop:
type: memory_distribution