commit 43609422072d75775412da5729121bd121106e5e
parent c9dc037f1cccca15a836bca12ffad93109b62669
Author: Jamie Nicol <jnicol@mozilla.com>
Date: Wed, 22 Oct 2025 14:25:03 +0000
Bug 1995541 - Do not indefinitely attempt to launch non-content child processes if they fail with STARTED_BUSY. r=geckoview-reviewers,toolkit-telemetry-reviewers,janerik,tcampbell
On Android it is not possible to run multiple instances of non-content
child processes, as we have only declared a single service of each
type in the manifest. This is fine in normal operation as there only
needs to be a single instance of each, but for xpcshell tests and
certain geckoview-junit tests we run multiple GeckoRuntimes in
parallel, each of which will attempt to launch its own child
processes.
Currently when a child process launch fails with a STARTED_BUSY error
code we will indefinitely retry launching it. This works for content
processes as eventually we will find a service that is not in use, but
for non-content processes this will never succeed. We end up spamming
the logcat with error messages, and eventually the parent process will
get killed by the OS for making too many binder requests.
In bug 1844829 we will implement a proper solution to allow multiple
instances of these processes. But in the meantime, this patch stops us
from indefinitely retrying the launch of busy non-content child
processes. This does mean that child processes will quickly fail to
launch in certain tests, which is not ideal. But it is an improvement
on the current situation where they never successfully launch anyway,
and instead are stuck in a loop attempting to launch until either the
test completes or the process is killed.
This causes certain legacy telemetry tests to now fail on CI. In
reality these tests were already broken, but (after an increased
timeout) were incorrectly being marked as passing due to the process
eventually being killed. Legacy telemetry is not used on Android, so
these tests are now disabled.
Differential Revision: https://phabricator.services.mozilla.com/D269561
Diffstat:
2 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/mobile/android/geckoview/src/main/java/org/mozilla/gecko/process/GeckoProcessManager.java b/mobile/android/geckoview/src/main/java/org/mozilla/gecko/process/GeckoProcessManager.java
@@ -864,13 +864,18 @@ public final class GeckoProcessManager extends IProcessManager.Stub {
if (error instanceof StartException) {
final StartException startError = (StartException) error;
- if (startError.errorCode == IChildProcess.STARTED_BUSY) {
+ if (isContent(info.type) && startError.errorCode == IChildProcess.STARTED_BUSY) {
// This process is owned by a different runtime, so we can't use
- // it. We will keep retrying indefinitely until we find a non-busy process.
+ // it. For content processes we will keep retrying indefinitely until
+ // we find a non-busy process.
// Note: this strategy is pretty bad, we go through each process in
// sequence until one works, the multiple runtime case is test-only
// for now, so that's ok. We can improve on this if we eventually
// end up needing something fancier.
+ // For non-content processes there is only a single service defined for
+ // each process type, meaning this will never succeed while an instance
+ // of that process is alive. We therefore do *not* want to retry
+ // indefinitely. See bug 1844829.
return start(info, retryLog);
}
}
diff --git a/toolkit/components/telemetry/tests/unit/xpcshell.toml b/toolkit/components/telemetry/tests/unit/xpcshell.toml
@@ -21,10 +21,10 @@ skip-if = ["os == 'android'"] # Disabled due to crashes (see bug 1331366)
tags = "addons"
["test_CoveragePing.js"]
-requesttimeoutfactor = 5 # Slow on Android
+skip-if = ["os == 'android'"] # Legacy telemetry is always disabled on Android
["test_EventPing.js"]
-requesttimeoutfactor = 5 # Slow on Android
+skip-if = ["os == 'android'"] # Legacy telemetry is always disabled on Android
tags = "coverage"
["test_HealthPing.js"]
@@ -35,10 +35,10 @@ skip-if = [
tags = "addons"
["test_MigratePendingPings.js"]
-requesttimeoutfactor = 3 # Slow on Android
+skip-if = ["os == 'android'"] # Legacy telemetry is always disabled on Android
["test_PingAPI.js"]
-requesttimeoutfactor = 3 # Slow on Android
+skip-if = ["os == 'android'"] # Legacy telemetry is always disabled on Android
["test_PingSender.js"]
skip-if = ["os == 'android'"]
@@ -76,7 +76,7 @@ skip-if = ["os == 'android' && processor == 'x86_64'"] # Disabled as Android/Ge
tags = "addons"
["test_TelemetryController_idle.js"]
-requesttimeoutfactor = 3 # Slow on Android
+skip-if = ["os == 'android'"] # Legacy telemetry is always disabled on Android
["test_TelemetryEnvironment.js"]
requesttimeoutfactor = 2 # Slow on Windows
@@ -122,7 +122,7 @@ skip-if = ["os == 'linux' && verify && debug"]
skip-if = ["os == 'android'"]
["test_TelemetrySession_activeTicks.js"]
-requesttimeoutfactor = 3 # Slow on Android
+skip-if = ["os == 'android'"] # Legacy telemetry is always disabled on Android
["test_TelemetryTimestamps.js"]