commit 61483b26f03028514ff769c6aab9dcbbe652a7e2
parent 01f977e353a442ef5b29b4eef91a74ca93a353ae
Author: Harshit <hsohaney@mozilla.com>
Date: Wed, 17 Dec 2025 19:36:34 +0000
Bug 2005776 - Add exponential backoff to backup retries. r=kpatenio,cdupuis
Differential Revision: https://phabricator.services.mozilla.com/D276543
Diffstat:
3 files changed, 221 insertions(+), 107 deletions(-)
diff --git a/browser/app/profile/firefox.js b/browser/app/profile/firefox.js
@@ -3487,7 +3487,7 @@ pref("browser.backup.template.fallback-download.aurora", "https://www.firefox.co
pref("browser.backup.template.fallback-download.nightly", "https://www.firefox.com/channel/desktop/?utm_medium=firefox-desktop&utm_source=html-backup");
pref("browser.backup.template.fallback-download.esr", " https://www.firefox.com/download/all/desktop-esr/?utm_medium=firefox-desktop&utm_source=html-backup");
pref("browser.backup.errorCode", 0);
-pref("browser.backup.backup-retry-limit", 100);
+pref("browser.backup.backup-retry-limit", 10);
pref("browser.backup.disabled-on-idle-backup-retry", false);
// Limit of number of unremovable staging directories and archives that are
// permitted before backup will stop making additional backups. Unremovable
diff --git a/browser/components/backup/BackupService.sys.mjs b/browser/components/backup/BackupService.sys.mjs
@@ -641,6 +641,26 @@ export class BackupService extends EventTarget {
static #errorRetries = 0;
/**
+ * Time to wait (in seconds) until the next backup attempt.
+ *
+ * This uses exponential backoff based on the number of consecutive
+ * failed backup attempts since the last successful backup.
+ *
+ * Backoff formula:
+ * 2^(retryCount) * 60
+ *
+ * Example:
+ * If 2 backup attempts have failed since the last successful backup,
+ * the next attempt will occur after:
+ *
+ * 2^2 * 60 = 240 seconds (4 minutes)
+ *
+ * This differs from minimumTimeBetweenBackupsSeconds, which is used to determine
+ * the time between successful backups.
+ */
+ static backoffSeconds = () => Math.pow(2, BackupService.#errorRetries) * 60;
+
+ /**
* @typedef {object} EnabledStatus
* @property {boolean} enabled
* True if the feature is enabled.
@@ -4434,35 +4454,88 @@ export class BackupService extends EventTarget {
}
/**
- * Calls BackupService.createBackup at the next moment when the event queue
- * is not busy with higher priority events. This is intentionally broken out
- * into its own method to make it easier to stub out in tests.
+ * Decide whether we should attempt a backup now.
*
- * @param {object} [options]
- * @param {boolean} [options.deletePreviousBackup]
- * @param {string} [options.reason]
+ * @returns {boolean}
*/
- createBackupOnIdleDispatch({ deletePreviousBackup = true, reason }) {
+ shouldAttemptBackup() {
let now = Math.floor(Date.now() / 1000);
- let errorStateDebugInfo = Services.prefs.getStringPref(
+ const debugInfoStr = Services.prefs.getStringPref(
BACKUP_DEBUG_INFO_PREF_NAME,
""
);
- // we retry failing backups every minimumTimeBetweenBackupsSeconds if
- // isRetryDisabledOnIdle is true. If isRetryDisabledOnIdle is false,
- // we retry on next idle until we hit backupRetryLimit and switch isRetryDisabledOnIdle to true
- if (
- lazy.isRetryDisabledOnIdle &&
- errorStateDebugInfo &&
- now - JSON.parse(errorStateDebugInfo).lastBackupAttempt <
- lazy.minimumTimeBetweenBackupsSeconds
- ) {
+ let parsed = null;
+ if (debugInfoStr) {
+ try {
+ parsed = JSON.parse(debugInfoStr);
+ } catch (e) {
+ lazy.logConsole.warn(
+ "Invalid backup debug-info pref; ignoring and allowing backup attempt.",
+ e
+ );
+ parsed = null;
+ }
+ }
+
+ const lastBackupAttempt = parsed?.lastBackupAttempt;
+ const hasErroredLastAttempt = Number.isFinite(lastBackupAttempt);
+
+ if (!hasErroredLastAttempt) {
+ lazy.logConsole.debug(
+ `There have been no errored last attempts, let's do a backup`
+ );
+ return true;
+ }
+
+ const secondsSinceLastAttempt = now - lastBackupAttempt;
+
+ if (lazy.isRetryDisabledOnIdle) {
+ // Add a buffer before restarting the retries. We divide by 2 because
+ // minimumTimeBetweenBackupsSeconds is currently set to 24 hours, and we
+ // want to keep approximately one backup per day — so retry again in
+ // about 12 hours.
+ if (secondsSinceLastAttempt < lazy.minimumTimeBetweenBackupsSeconds / 2) {
+ lazy.logConsole.debug(
+ `Retrying is disabled, we have to wait for ${lazy.minimumTimeBetweenBackupsSeconds / 2}s to retry`
+ );
+ return false;
+ }
+ // Looks like we've waited enough, reset the retry states and try to create
+ // a backup again.
+ BackupService.#errorRetries = 0;
+ Services.prefs.clearUserPref(DISABLED_ON_IDLE_RETRY_PREF_NAME);
+
+ return true;
+ }
+
+ // Exponential backoff guard: avoids hammering a persistent failure with
+ // back-to-back retry attempts.
+ if (secondsSinceLastAttempt < BackupService.backoffSeconds()) {
lazy.logConsole.debug(
- `We've already retried in the last ${lazy.minimumTimeBetweenBackupsSeconds}s. Waiting for next valid idleDispatch to try again.`
+ `backoff: elapsed ${secondsSinceLastAttempt}s < backoff ${BackupService.backoffSeconds()}s`
);
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Calls BackupService.createBackup at the next moment when the event queue
+ * is not busy with higher priority events. This is intentionally broken out
+ * into its own method to make it easier to stub out in tests.
+ *
+ * @param {object} [options]
+ * @param {boolean} [options.deletePreviousBackup]
+ * @param {string} [options.reason]
+ *
+ * @returns {Promise} A backup promise to hold onto
+ */
+ createBackupOnIdleDispatch({ deletePreviousBackup = true, reason }) {
+ if (!this.shouldAttemptBackup()) {
return Promise.resolve();
}
+
// Determine path to old backup file
const oldBackupFile = this.#_state.lastBackupFileName;
const isScheduledBackupsEnabled = lazy.scheduledBackupsPref;
@@ -4488,8 +4561,10 @@ export class BackupService extends EventTarget {
BackupService.#errorRetries += 1;
if (BackupService.#errorRetries > lazy.backupRetryLimit) {
- // We've had too many errors with retries, let's only backup on next timestamp
Services.prefs.setBoolPref(DISABLED_ON_IDLE_RETRY_PREF_NAME, true);
+ // Further attempts are now gated by the disabled-on-idle path (roughly
+ // minimumTimeBetweenBackupsSeconds / 2 after the last attempt), so reset
+ // the counter so the backoff heuristic starts fresh once retries resume.
+ BackupService.#errorRetries = 0;
Glean.browserBackup.backupThrottled.record();
}
} finally {
diff --git a/browser/components/backup/tests/xpcshell/test_BackupService_retryHeuristic.js b/browser/components/backup/tests/xpcshell/test_BackupService_retryHeuristic.js
@@ -6,6 +6,7 @@ https://creativecommons.org/publicdomain/zero/1.0/ */
ChromeUtils.defineESModuleGetters(this, {
BackupError: "resource:///modules/backup/BackupError.mjs",
ERRORS: "chrome://browser/content/backup/backup-constants.mjs",
+ TestUtils: "resource://testing-common/TestUtils.sys.mjs",
});
const BACKUP_RETRY_LIMIT_PREF_NAME = "browser.backup.backup-retry-limit";
@@ -18,6 +19,81 @@ const SCHEDULED_BACKUPS_ENABLED_PREF_NAME = "browser.backup.scheduled.enabled";
const BACKUP_DEBUG_INFO_PREF_NAME = "browser.backup.backup-debug-info";
const BACKUP_DEFAULT_LOCATION_PREF_NAME = "browser.backup.location";
+const RETRIES_FOR_TEST = 4;
+
+async function create_backup_failure_expected_calls(
+ bs,
+ callCount,
+ assertionMsg
+) {
+ assertionMsg = assertionMsg
+ ? assertionMsg
+ : `createBackup should be called ${callCount} times`;
+
+ let originalBackoffTime = BackupService.backoffSeconds();
+
+ bs.createBackupOnIdleDispatch({});
+
+ // testing that callCount remains the same, skip all the other checks
+ if (callCount == bs.createBackup.callCount) {
+ Assert.equal(bs.createBackup.callCount, callCount, assertionMsg);
+
+ return;
+ }
+
+ // Wait for in progress states to change
+ // so that the errorRetries can be updated
+
+ await bsInProgressStateUpdate(bs, true);
+ await bsInProgressStateUpdate(bs, false);
+
+ // propagate prefs
+ await TestUtils.waitForTick();
+
+ // Has createBackup been called more times than the allowed retries?
+ // If so, the retry state should reset and retrying on idle should be
+ // disabled, so createBackup will not be called again.
+ if (callCount == RETRIES_FOR_TEST + 1) {
+ Assert.equal(
+ Glean.browserBackup.backupThrottled.testGetValue().length,
+ 1,
+ "backupThrottled telemetry was sent"
+ );
+
+ Assert.ok(
+ Services.prefs.getBoolPref(DISABLED_ON_IDLE_RETRY_PREF_NAME),
+ "Disable on idle is now enabled - no more retries allowed"
+ );
+ }
+ // we expect createBackup to be called, but it shouldn't succeed
+ else {
+ Assert.equal(
+ BackupService.backoffSeconds(),
+ 2 * originalBackoffTime,
+ "Backoff time should have doubled"
+ );
+
+ Assert.ok(
+ !Services.prefs.getBoolPref(DISABLED_ON_IDLE_RETRY_PREF_NAME),
+ "Disable on idle is disabled - which means that we can do more retries!"
+ );
+
+ Assert.equal(
+ Glean.browserBackup.backupThrottled.testGetValue(),
+ null,
+ "backupThrottled telemetry was not sent yet"
+ );
+ }
+
+ Assert.equal(bs.createBackup.callCount, callCount, assertionMsg);
+
+ Assert.equal(
+ Services.prefs.getIntPref(BACKUP_ERROR_CODE_PREF_NAME),
+ ERRORS.UNKNOWN,
+ "Error code has been set"
+ );
+}
+
function bsInProgressStateUpdate(bs, isBackupInProgress) {
// Check if already in desired state
if (bs.state.backupInProgress === isBackupInProgress) {
@@ -47,7 +123,7 @@ add_setup(async () => {
TEST_PROFILE_PATH
);
Services.prefs.setBoolPref(SCHEDULED_BACKUPS_ENABLED_PREF_NAME, true);
- Services.prefs.setIntPref(BACKUP_RETRY_LIMIT_PREF_NAME, 2);
+ Services.prefs.setIntPref(BACKUP_RETRY_LIMIT_PREF_NAME, RETRIES_FOR_TEST);
Services.prefs.setBoolPref(DISABLED_ON_IDLE_RETRY_PREF_NAME, false);
setupProfile();
@@ -62,13 +138,13 @@ add_setup(async () => {
});
});
-add_task(async function test_retry_limit() {
+add_task(async function test_retries_no_backoff() {
Services.fog.testResetFOG();
let bs = new BackupService();
let sandbox = sinon.createSandbox();
// Make createBackup fail intentionally
- const createBackupFailureStub = sandbox
+ sandbox
.stub(bs, "resolveArchiveDestFolderPath")
.rejects(new BackupError("forced failure", ERRORS.UNKNOWN));
@@ -83,62 +159,53 @@ add_task(async function test_retry_limit() {
// ensure that there is no error code set
Services.prefs.setIntPref(BACKUP_ERROR_CODE_PREF_NAME, ERRORS.NONE);
- bs.createBackupOnIdleDispatch({});
-
- // #backupInProgress is set to true
- await bsInProgressStateUpdate(bs, true);
- // #backupInProgress is set to false
- await bsInProgressStateUpdate(bs, false);
-
- // wait a tick for the prefs to update
- await new Promise(executeSoon);
-
- Assert.equal(
- bs.createBackup.callCount,
- i + 1,
- "createBackup was called on idle"
- );
- Assert.equal(
- Services.prefs.getIntPref(BACKUP_ERROR_CODE_PREF_NAME),
- ERRORS.UNKNOWN,
- "Error code has been set"
+ // Set the lastBackupAttempt to the current backoff threshold, to avoid hitting
+ // the exponential backoff clause for this test.
+ Services.prefs.setStringPref(
+ BACKUP_DEBUG_INFO_PREF_NAME,
+ JSON.stringify({
+ lastBackupAttempt:
+ Math.floor(Date.now() / 1000) - (BackupService.backoffSeconds() + 1),
+ errorCode: ERRORS.UNKNOWN,
+ lastRunStep: 0,
+ })
);
- if (i < n) {
- Assert.equal(
- Glean.browserBackup.backupThrottled.testGetValue(),
- null,
- "backupThrottled telemetry was not sent yet"
- );
- } else {
- // On this call, createBackup _was_ called, but the next call will be
- // ignored. However, the telemetry ping is sent now.
- Assert.equal(
- Glean.browserBackup.backupThrottled.testGetValue().length,
- 1,
- "backupThrottled telemetry was sent"
- );
- }
+ await create_backup_failure_expected_calls(bs, i + 1);
}
// check if it switched to no longer creating backups on idle
- const previousCalls = bs.createBackup.callCount;
+ await create_backup_failure_expected_calls(
+ bs,
+ bs.createBackup.callCount,
+ "createBackup was not called since we hit the retry limit"
+ );
- bs.createBackupOnIdleDispatch({});
+ sandbox.restore();
+});
- // wait a tick for the pref to update
- await new Promise(executeSoon);
+add_task(async function test_exponential_backoff() {
+ Services.fog.testResetFOG();
- Assert.equal(
- bs.createBackup.callCount,
- previousCalls,
- "createBackup was not called again after hitting the retry limit"
- );
- Assert.ok(
- Services.prefs.getBoolPref(DISABLED_ON_IDLE_RETRY_PREF_NAME),
- "Disable on idle has been enabled"
+ let bs = new BackupService();
+ let sandbox = sinon.createSandbox();
+ const createBackupFailureStub = sandbox
+ .stub(bs, "resolveArchiveDestFolderPath")
+ .rejects(new BackupError("forced failure", ERRORS.UNKNOWN));
+
+ sandbox.stub(ChromeUtils, "idleDispatch").callsFake(callback => callback());
+ sandbox.spy(bs, "createBackup");
+
+ Services.prefs.setIntPref(BACKUP_ERROR_CODE_PREF_NAME, ERRORS.NONE);
+ Services.prefs.setStringPref(
+ BACKUP_DEBUG_INFO_PREF_NAME,
+ JSON.stringify({
+ lastBackupAttempt:
+ Math.floor(Date.now() / 1000) - (BackupService.backoffSeconds() + 1),
+ errorCode: ERRORS.UNKNOWN,
+ lastRunStep: 0,
+ })
);
- Services.fog.testResetFOG();
Services.prefs.setIntPref(MINIMUM_TIME_BETWEEN_BACKUPS_SECONDS_PREF_NAME, 0);
registerCleanupFunction(() => {
Services.prefs.clearUserPref(
@@ -146,36 +213,12 @@ add_task(async function test_retry_limit() {
);
});
- // add a buffer between lastAttempt and now
- // eslint-disable-next-line mozilla/no-arbitrary-setTimeout
- await new Promise(resolve => setTimeout(resolve, 10));
+ await create_backup_failure_expected_calls(bs, 1);
- bs.createBackupOnIdleDispatch({});
-
- // #backupInProgress is set to true
- await bsInProgressStateUpdate(bs, true);
-
- Assert.equal(
- bs.createBackup.callCount,
- previousCalls + 1,
- "createBackup was called again"
- );
-
- Assert.equal(
- Glean.browserBackup.backupThrottled.testGetValue(),
- null,
- "backupThrottled telemetry was not sent after resuming backups"
- );
-
- // #backupInProgress is set to false
- await bsInProgressStateUpdate(bs, false);
-
- // let's restore the failing function in createBackup
+ // Remove the stub, ensure that a success leads to the prefs
+ // and retries resetting
createBackupFailureStub.restore();
- // Now, on idleDispatch, we don't expect any failures
-
- // add a buffer between lastAttempt and now
// eslint-disable-next-line mozilla/no-arbitrary-setTimeout
await new Promise(resolve => setTimeout(resolve, 10));
@@ -184,30 +227,26 @@ add_task(async function test_retry_limit() {
"testBackup_profile"
);
- // calling createBackup directly to avoid race conditions with reliably
- // awaiting createBackup to finish from a call to idleDispatch. This case covers
- // the use case of reseting the prefs if createBackup is called and does not fail.
await bs.createBackup({
profilePath: testProfilePath,
});
- // the error states should have reset
- Assert.ok(
- !Services.prefs.getBoolPref(DISABLED_ON_IDLE_RETRY_PREF_NAME),
- "Retry on idle is enabled now"
- );
-
Assert.equal(
Services.prefs.getIntPref(BACKUP_ERROR_CODE_PREF_NAME),
ERRORS.NONE,
"The error code is reset to NONE"
);
+ Assert.equal(
+ 60,
+ BackupService.backoffSeconds(),
+ "The exponential backoff is reset to 1 minute (60s)"
+ );
+
Assert.ok(
!Services.prefs.getStringPref(BACKUP_DEBUG_INFO_PREF_NAME, null),
"Error debug info has been cleared"
);
- // check if changing the time since last backup calls createBackup
sandbox.restore();
});