tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit e30793607c53fbcd31bf2411944dfb36675d3e79
parent 29933f495e98918fb343238dde25f551489e2e5c
Author: Serban Stanca <sstanca@mozilla.com>
Date:   Wed, 10 Dec 2025 03:49:25 +0200

Revert "Bug 2000033, Bug 2000682: Integrate Glean MLTelemetry into the inference pipeline r=ai-ondevice-reviewers,gregtatum,rconcepcion" for causing perftests bustages.

This reverts commit 5989e4930b3768e4914e5414b88f93ec580a3fc7.

This reverts commit 416543f28ae08552f777d44acd259c087b3fb3aa.

Diffstat:
Mtoolkit/components/ml/MLTelemetry.sys.mjs | 151++++++++++++++++++++++---------------------------------------------------------
Mtoolkit/components/ml/actors/MLEngineParent.sys.mjs | 86++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Mtoolkit/components/ml/content/backends/ONNXPipeline.mjs | 253+++++++++++++++++++++++++------------------------------------------------------
Mtoolkit/components/ml/metrics.yaml | 29+----------------------------
Mtoolkit/components/ml/tests/browser/browser_ml_telemetry.js | 45++-------------------------------------------
5 files changed, 172 insertions(+), 392 deletions(-)

diff --git a/toolkit/components/ml/MLTelemetry.sys.mjs b/toolkit/components/ml/MLTelemetry.sys.mjs @@ -11,10 +11,6 @@ ChromeUtils.defineLazyGetter(lazy, "console", () => { }); }); -ChromeUtils.defineESModuleGetters(lazy, { - isAddonEngineId: "chrome://global/content/ml/Utils.sys.mjs", -}); - /** * MLTelemetry provides a mechanism tracking a "flow" of operations * related to a machine learning feature. A flow is a sequence of related @@ -74,20 +70,6 @@ export class MLTelemetry { } /** - * Returns the label used in telemetry for a given engine ID. - * Converts addon engine IDs to "webextension" label. - * - * @param {string} engineId - The engine ID to convert. - * @returns {string} The Glean label for the engine. - */ - static getGleanLabel(engineId) { - if (lazy.isAddonEngineId(engineId)) { - return "webextension"; - } - return engineId; - } - - /** * Starts a telemetry session for the given flow. * * @param {object} [options] - Session start options. @@ -167,12 +149,13 @@ export class MLTelemetry { * @param {object} options - Engine creation success options. * @param {string} [options.flowId] - The flow ID. Uses instance flowId if not provided. * @param {string} options.engineId - The engine identifier (e.g., "pdfjs", "ml-suggest-intent"). + * @param {string} [options.label] - Label for the old timing distribution metric. Defaults to engineId if not provided. * @param {number} options.duration - Engine creation time in milliseconds. */ - recordEngineCreationSuccessFlow({ flowId, engineId, duration }) { + recordEngineCreationSuccessFlow({ flowId, engineId, label, duration }) { const currentFlowId = flowId || this.#flowId; const actualEngineId = engineId; - const actualLabel = MLTelemetry.getGleanLabel(engineId); + const actualLabel = label || engineId; Glean.firefoxAiRuntime.engineCreationSuccessFlow.record({ flow_id: currentFlowId, @@ -202,7 +185,7 @@ export class MLTelemetry { * @param {string} options.featureId - The feature identifier. * @param {string} options.taskName - The task name. * @param {string} options.engineId - The engine identifier. - * @param {string|object} options.error - The error class/message or object. + * @param {string} options.error - The error class/message. */ recordEngineCreationFailure({ flowId, @@ -213,11 +196,6 @@ export class MLTelemetry { error, }) { const currentFlowId = flowId || this.#flowId; - // Ensure error is always a string - const errorString = - typeof error === "object" && error !== null - ? String(error.name || error.message || error) - : String(error); Glean.firefoxAiRuntime.engineCreationFailure.record({ flow_id: currentFlowId, @@ -225,7 +203,7 @@ export class MLTelemetry { featureId, taskName, engineId, - error: errorString, + error, }); this.logEventToConsole(this.recordEngineCreationFailure, { @@ -234,112 +212,67 @@ export class MLTelemetry { featureId, taskName, engineId, - error: errorString, + error, }); } /** * Records a successful inference run event. * - * @param {string} engineId - The engine identifier. - * @param {object} metrics - The inference metrics object. - * @param {number} [metrics.preprocessingTime] - Time spent preprocessing (legacy). - * @param {number} [metrics.tokenizingTime] - Time spent tokenizing in milliseconds. - * @param {number} [metrics.inferenceTime] - Time spent on inference in milliseconds. - * @param {number} [metrics.decodingTime] - Time spent decoding in milliseconds. - * @param {number} [metrics.inputTokens] - Number of input tokens. - * @param {number} [metrics.outputTokens] - Number of output tokens. - * @param {number} [metrics.timeToFirstToken] - Time to first token in milliseconds. - * @param {number} [metrics.tokensPerSecond] - Tokens per second. - * @param {number} [metrics.timePerOutputToken] - Time per output token in milliseconds. + * @param {object} options - Inference success options. + * @param {string} [options.flowId] - The flow ID. Uses instance flowId if not provided. + * @param {string} [options.engineId] - The engine identifier. Defaults to undefined. + * @param {string} [options.label] - Label for the old timing distribution metric. Defaults to no-label if not provided. + * @param {number} options.tokenizingTime - Time spent tokenizing in milliseconds. + * @param {number} options.inferenceTime - Time spent on inference in milliseconds. */ - recordRunInferenceSuccessFlow(engineId, metrics) { - try { - const currentFlowId = this.#flowId; - const EngineId = engineId || undefined; - const Label = engineId ? MLTelemetry.getGleanLabel(engineId) : "no-label"; - - // Handle legacy preprocessingTime field - const tokenizingTime = - metrics.preprocessingTime ?? metrics.tokenizingTime; - - // Ensure all metrics are properly rounded/typed for Glean - // This will be updated to use the method from revision(D271263) - const gleanPayload = { - flow_id: currentFlowId, - tokenizing_time: - tokenizingTime != null ? Math.round(tokenizingTime) : undefined, - inference_time: - metrics.inferenceTime != null - ? Math.round(metrics.inferenceTime) - : undefined, - decoding_time: - metrics.decodingTime != null - ? Math.round(metrics.decodingTime) - : undefined, - input_tokens: - metrics.inputTokens != null - ? Math.round(metrics.inputTokens) - : undefined, - output_tokens: - metrics.outputTokens != null - ? Math.round(metrics.outputTokens) - : undefined, - time_to_first_token: - metrics.timeToFirstToken != null - ? Math.round(metrics.timeToFirstToken) - : undefined, - tokens_per_second: - metrics.tokensPerSecond != null - ? Math.round(metrics.tokensPerSecond * 100) / 100 - : undefined, - time_per_output_token: - metrics.timePerOutputToken != null - ? Math.round(metrics.timePerOutputToken * 100) / 100 - : undefined, - }; - - Glean.firefoxAiRuntime.runInferenceSuccessFlow.record(gleanPayload); + recordRunInferenceSuccessFlow({ + flowId, + engineId, + label, + tokenizingTime, + inferenceTime, + }) { + const currentFlowId = flowId || this.#flowId; + const EngineId = engineId || undefined; + const Label = label || "no-label"; - // record the old labeled timing distribution metric - const totalTime = Math.round( - (tokenizingTime || 0) + - (metrics.inferenceTime || 0) + - (metrics.decodingTime || 0) - ); + Glean.firefoxAiRuntime.runInferenceSuccessFlow.record({ + flow_id: currentFlowId, + tokenizing_time: Math.round(tokenizingTime), + inference_time: Math.round(inferenceTime), + }); - Glean.firefoxAiRuntime.runInferenceSuccess[Label].accumulateSingleSample( - totalTime - ); + // Also record the old labeled timing distribution metric + const totalTime = Math.round(tokenizingTime + inferenceTime); + Glean.firefoxAiRuntime.runInferenceSuccess[Label].accumulateSingleSample( + totalTime + ); - this.logEventToConsole(this.recordRunInferenceSuccessFlow, { - ...gleanPayload, - engineId: EngineId, - label: Label, - }); - } catch (telemetryError) { - lazy.console.error("Failed to record ML telemetry:", telemetryError); - } + this.logEventToConsole(this.recordRunInferenceSuccessFlow, { + flowId: currentFlowId, + engineId: EngineId, + label: Label, + tokenizingTime, + inferenceTime, + }); } /** * Records a failed inference run event. * - * @param {string|object} error - The error class/message or object. + * @param {string} error - The error class/message. * @param {string} [flow_id=this.#flowId] - The flow ID. Uses instance flowId if not provided. */ recordRunInferenceFailure(error, flow_id = this.flowId) { - // Ensure error is always a string - const errorString = error instanceof Error ? error.message : String(error); - Glean.firefoxAiRuntime.runInferenceFailure.record({ flow_id, - error: errorString, + error, }); this.logEventToConsole(this.recordRunInferenceFailure, { flow_id, - error: errorString, + error, }); } diff --git a/toolkit/components/ml/actors/MLEngineParent.sys.mjs b/toolkit/components/ml/actors/MLEngineParent.sys.mjs @@ -2,7 +2,6 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; -import { MLTelemetry } from "chrome://global/content/ml/MLTelemetry.sys.mjs"; /** * @import { MLEngineChild } from "./MLEngineChild.sys.mjs" @@ -16,6 +15,7 @@ const lazy = XPCOMUtils.declareLazy({ clearTimeout: "resource://gre/modules/Timer.sys.mjs", ModelHub: "chrome://global/content/ml/ModelHub.sys.mjs", Progress: "chrome://global/content/ml/Utils.sys.mjs", + isAddonEngineId: "chrome://global/content/ml/Utils.sys.mjs", OPFS: "chrome://global/content/ml/OPFS.sys.mjs", BACKENDS: "chrome://global/content/ml/EngineProcess.sys.mjs", stringifyForLog: "chrome://global/content/ml/Utils.sys.mjs", @@ -222,7 +222,7 @@ export class MLEngineParent extends JSProcessActorParent { }); } - const { featureId, engineId } = pipelineOptions; + const engineId = pipelineOptions.engineId; // Allow notifications callback changes even when reusing engine. this.notificationsCallback = notificationsCallback; @@ -235,7 +235,6 @@ export class MLEngineParent extends JSProcessActorParent { Promise.withResolvers(); MLEngineParent.engineLocks.set(engineId, lockPromise); MLEngineParent.engineCreationAbortSignal.set(engineId, abortSignal); - try { const currentEngine = MLEngine.getInstance(engineId); if (currentEngine) { @@ -274,24 +273,12 @@ export class MLEngineParent extends JSProcessActorParent { const creationTime = ChromeUtils.now() - start; - engine.telemetry.recordEngineCreationSuccessFlow({ - engineId, - duration: creationTime, - }); + Glean.firefoxAiRuntime.engineCreationSuccess[ + engine.getGleanLabel() + ].accumulateSingleSample(creationTime); // TODO - What happens if the engine is already killed here? return engine; - } catch (error) { - const { modelId, taskName, flowId } = pipelineOptions; - const telemetry = new MLTelemetry({ featureId, flowId }); - telemetry.recordEngineCreationFailure({ - modelId, - featureId, - taskName, - engineId, - error, - }); - throw error; } finally { MLEngineParent.engineLocks.delete(engineId); MLEngineParent.engineCreationAbortSignal.delete(engineId); @@ -994,6 +981,18 @@ export class MLEngine { notificationsCallback = null; /** + * Returns the label used in telemetry for that engine id + * + * @returns {string} + */ + getGleanLabel() { + if (lazy.isAddonEngineId(this.engineId)) { + return "webextension"; + } + return this.engineId; + } + + /** * Removes an instance of the MLEngine with the given engineId. * * @param {string} engineId - The ID of the engine instance to be removed. @@ -1038,10 +1037,6 @@ export class MLEngine { this.mlEngineParent = mlEngineParent; this.pipelineOptions = pipelineOptions; this.notificationsCallback = notificationsCallback; - this.telemetry = new MLTelemetry({ - featureId: pipelineOptions.featureId, - flowId: pipelineOptions.flowId, - }); } /** @@ -1330,20 +1325,28 @@ export class MLEngine { const request = this.#requests.get(requestId); if (request) { if (error) { - this.telemetry.recordRunInferenceFailure(error); - request.reject(error); - } else if (response) { + Glean.firefoxAiRuntime.runInferenceFailure.record({ + engineId: this.engineId, + modelId: this.pipelineOptions.modelId, + featureId: this.pipelineOptions.featureId, + }); + } + if (response) { // Validate response before returning to caller const validatedResponse = this.#validateResponse(response); if (!validatedResponse) { request.reject(new Error("Response failed security validation")); } else { - this.telemetry.recordRunInferenceSuccessFlow( - this.engineId, - validatedResponse.metrics - ); + const totalTime = + validatedResponse.metrics.tokenizingTime + + validatedResponse.metrics.inferenceTime; + Glean.firefoxAiRuntime.runInferenceSuccess[ + this.getGleanLabel() + ].accumulateSingleSample(totalTime); request.resolve(validatedResponse); } + } else { + request.reject(error); } } else { lazy.console.error( @@ -1545,19 +1548,26 @@ export class MLEngine { (resourcesAfter.cpuTime - resourcesBefore.cpuTime) / 1_000_000; const wallMilliseconds = ChromeUtils.now() - beforeRun; const cores = lazy.mlUtils.getOptimalCPUConcurrency(); - const cpuUtilization = (cpuMilliseconds / wallMilliseconds / cores) * 100; + const cpuUtilization = cpuMilliseconds / wallMilliseconds / cores; const memoryBytes = resourcesAfter.memory; - this.telemetry.recordEngineRun({ - cpuMilliseconds, - wallMilliseconds, + const data = { + // Timing: + cpu_milliseconds: cpuMilliseconds, + wall_milliseconds: wallMilliseconds, cores, - cpuUtilization, - memoryBytes, - engineId: this.engineId, - modelId: this.pipelineOptions.modelId, + cpu_utilization: cpuUtilization, + memory_bytes: memoryBytes, + + // Model information: + engine_id: this.engineId, + model_id: this.pipelineOptions.modelId, + feature_id: this.pipelineOptions.featureId, backend: this.pipelineOptions.backend, - }); + }; + + lazy.console?.debug("[Glean.firefoxAiRuntime.engineRun]", data); + Glean.firefoxAiRuntime.engineRun.record(data); }); return resolvers.promise; diff --git a/toolkit/components/ml/content/backends/ONNXPipeline.mjs b/toolkit/components/ml/content/backends/ONNXPipeline.mjs @@ -9,20 +9,6 @@ * @typedef {import("../../content/Utils.sys.mjs").ProgressAndStatusCallbackParams} ProgressAndStatusCallbackParams */ -/** - * @typedef {object} PipelineMetrics - * @property {number} [preprocessingTime] - Time spent preprocessing inputs (ms). - * @property {number} [tokenizingTime] - Time spent tokenizing (same as preprocessingTime, for Glean consistency) (ms). - * @property {number} [inferenceTime] - Time spent running the model (ms). - * @property {number} [decodingTime] - Time spent decoding outputs (ms). - * @property {number} inputTokens - Number of tokens in the input. - * @property {number} outputTokens - Number of tokens in the output. - * @property {number|null} [timeToFirstToken] - Time to the first generated token (ms). - * @property {number} [tokensPerSecond] - Inference throughput (tokens/s). - * @property {number} [timePerOutputToken] - Latency per output token (ms). - * @property {Array<object>} [runTimestamps] - Timeline of execution events. - */ - /* eslint-disable-next-line mozilla/reject-import-system-module-from-non-system */ import { AppConstants } from "resource://gre/modules/AppConstants.sys.mjs"; @@ -77,7 +63,7 @@ let transformers = null; * @function importTransformers * @param {string} backend - The backend to use (e.g. "onnx-native" or "onnx"). * @returns {Promise<import("chrome://global/content/ml/transformers-dev.js")>} - * A promise that resolves once the Transformers library is imported. + * A promise that resolves once the Transformers library is imported. */ export async function importTransformers(backend) { if (transformers) { @@ -177,10 +163,7 @@ async function echo(request, _model, _tokenizer, _processor, config) { return { metrics: { - preprocessingTime: 0, - decodingTime: 0, - inputTokens: 0, - outputTokens: 0, + tokenizingTime: 0, }, output: result, }; @@ -206,16 +189,10 @@ async function imageToText(request, model, tokenizer, processor, _config) { let result = { metrics: { inferenceTime: 0, - preprocessingTime: 0, - decodingTime: 0, - inputTokens: null, - outputTokens: 0, + tokenizingTime: 0, }, }; - // Destructure to simplify assignments - const { metrics } = result; - - let startLoad = ChromeUtils.now(); + let start = Date.now(); let rawImage; if ("url" in request) { @@ -229,28 +206,27 @@ async function imageToText(request, model, tokenizer, processor, _config) { ); } - lazy.console.debug("Image loaded in ", ChromeUtils.now() - startLoad); + lazy.console.debug("Image loaded in ", Date.now() - start); - const startProcessing = ChromeUtils.now(); const { pixel_values } = await processor(rawImage); - metrics.preprocessingTime += ChromeUtils.now() - startProcessing; + result.metrics.tokenizingTime += Date.now() - start; const toReturn = []; const streamer = request.options?.streamer; for (const batch of pixel_values) { batch.dims = [1, ...batch.dims]; - const startInference = ChromeUtils.now(); + start = Date.now(); const output = await model.generate({ inputs: batch, streamer }); - metrics.inferenceTime += ChromeUtils.now() - startInference; - const startDecoding = ChromeUtils.now(); + result.metrics.inferenceTime += Date.now() - start; + start = Date.now(); const decoded = tokenizer .batch_decode(output, { skip_special_tokens: true, }) .map(x => ({ generated_text: x.trim() })); - metrics.decodingTime += ChromeUtils.now() - startDecoding; + result.metrics.tokenizingTime += Date.now() - start; toReturn.push(decoded); } - lazy.console.debug("Inference done in ", ChromeUtils.now() - startProcessing); + lazy.console.debug("Inference done in ", Date.now() - start); result.output = toReturn[0][0].generated_text; // Bug 1918220 - replace the result for models with that bug @@ -291,14 +267,11 @@ async function textToGoal( ) { const result = { metrics: { - preprocessingTime: 0, + tokenizingTime: 0, inferenceTime: 0, - inputTokens: 0, - outputTokens: 0, }, output: [], }; - const { metrics } = result; const texts = request.args?.[0] ?? []; const taskTypes = request.args?.[1] ?? []; // ["query", "page", ...] @@ -309,7 +282,7 @@ async function textToGoal( const task = taskTypes[i] ?? "query"; const domain = domains[i] ?? ""; - const startToken = ChromeUtils.now(); + const startToken = Date.now(); const encoded = await tokenizer(text, { padding: "max_length", @@ -317,7 +290,7 @@ async function textToGoal( max_length: 64, return_attention_mask: true, }); - metrics.preprocessingTime += ChromeUtils.now() - startToken; + result.metrics.tokenizingTime += Date.now() - startToken; const input_ids = encoded.input_ids.ort_tensor; const attention_mask = encoded.attention_mask.ort_tensor; const domain_vocab = modelConfig["transformers.js_config"].domain_vocab; @@ -345,11 +318,10 @@ async function textToGoal( task_type, }; - const startInfer = ChromeUtils.now(); + const startInfer = Date.now(); const session = model.sessions.model; const output = await session.run(inputs); - metrics.inferenceTime += ChromeUtils.now() - startInfer; - metrics.inputTokens += encoded.input_ids.ort_tensor.dims[1]; + result.metrics.inferenceTime += Date.now() - startInfer; result.output.push({ embedding: Array.from(output.embedding.data), @@ -649,7 +621,7 @@ export class ONNXPipeline { async #metricsSnapShot({ name, snapshot = {} }) { if (!("when" in snapshot)) { - snapshot.when = ChromeUtils.now(); + snapshot.when = Date.now(); } this.#metrics.push({ name, ...snapshot }); } @@ -667,7 +639,7 @@ export class ONNXPipeline { */ static async initialize(mlEngineWorker, runtime, options, errorFactory) { let snapShot = { - when: ChromeUtils.now(), + when: Date.now(), }; if (options.logLevel) { @@ -788,114 +760,86 @@ export class ONNXPipeline { async run(request, requestId, inferenceProgressCallback = null) { lazy.console.debug("Running task: ", this.#config.taskName); - /** @type {PipelineMetrics} */ - const metrics = { - inputTokens: 0, - outputTokens: 0, - preprocessingTime: 0, - tokenizingTime: 0, // Same as preprocessingTime, but named for Glean consistency - inferenceTime: 0, - decodingTime: 0, - timeToFirstToken: null, - tokensPerSecond: 0, - timePerOutputToken: 0, - runTimestamps: [], - }; - - /** - * Helper to record a timestamp in the metrics timeline. - * - * @param {string} name - */ - const snapshot = name => { - metrics.runTimestamps.push({ name, when: ChromeUtils.now() }); - }; - - const runStartTime = ChromeUtils.now(); - snapshot("runStart"); + let result; + await this.#metricsSnapShot({ name: "runStart" }); const tokenizer = this.#genericPipelineFunction?.tokenizer ?? this.#tokenizer; - if (this.#genericPipelineFunction && tokenizer && request.args?.[0]) { - try { - const inputs = [request.args[0]].flat(); - for (const text of inputs) { - if (typeof text === "string") { - const encoded = await tokenizer.encode(text); - metrics.inputTokens += encoded.length; - } - } - } catch (e) { - lazy.console.debug( - "Could not count input tokens for generic pipeline", - e - ); - } - } - const progressInfo = { ok: true, id: request.id ?? requestId, }; const streamerOptions = { - perTokens: true, + perTokens: false, skipPrompt: true, returnTokens: false, ...request.streamerOptions, }; - let streamer; + let streamer = undefined; let chunkTokens = []; - // Removed unused chunkText declaration here - - let firstTokenTimestamp = null; - if (tokenizer) { + let chunkText = ""; + let nextTokensArePrompt = !streamerOptions.skipPrompt; + let restoreTokenizer = false; + + if (tokenizer && inferenceProgressCallback) { + const flushPrompts = _tokens => { + streamer.token_cache = _tokens; + streamer.end(); + streamer.tokenizer = { + decode: () => { + streamer.token_cache = []; + return ""; + }, + }; + restoreTokenizer = true; + streamer.next_tokens_are_prompt = false; + }; streamer = new transformers.TextStreamer(tokenizer, { skip_prompt: streamerOptions.skipPrompt, decode_kwargs: { skip_special_tokens: true, }, token_callback_function: tokens => { - // Record Time To First Token on the very first callback - const now = ChromeUtils.now(); - if (metrics.timeToFirstToken === null) { - metrics.timeToFirstToken = now - runStartTime; - firstTokenTimestamp = now; - } - - metrics.outputTokens += tokens.length; - - // Only proceed with buffering if we have a callback to call - if (!inferenceProgressCallback) { - return; - } - - if (streamerOptions.perTokens) { - // Logic handled in callback_function - } else { - // Append newly received tokens. - chunkTokens.push(tokens); - } - }, - // Per-word (or per-token if perTokens=true) callback function - callback_function: text => { - if (!inferenceProgressCallback) { - return; + if (restoreTokenizer) { + streamer.tokenizer = tokenizer; + restoreTokenizer = false; } if (streamerOptions.perTokens) { + if (nextTokensArePrompt) { + flushPrompts(tokens); + } + inferenceProgressCallback({ ...progressInfo, metadata: { - text, - tokens: streamerOptions.returnTokens ? chunkTokens : null, + text: chunkText, + tokens: streamerOptions.returnTokens ? tokens : null, + isPrompt: nextTokensArePrompt, requestId, - isPrompt: false, // skipping prompt, so assumed false }, type: lazy.Progress.ProgressType.INFERENCE, statusText: lazy.Progress.ProgressStatusText.IN_PROGRESS, }); + + // We have sent the text, now resetting it + chunkText = ""; + } else { + // Append newly received tokens. + chunkTokens.push(tokens); + + if (nextTokensArePrompt) { + flushPrompts(tokens); + } + } + nextTokensArePrompt = false; + }, + // Per-word callback function + callback_function: text => { + if (streamerOptions.perTokens) { + chunkText = text; } else { inferenceProgressCallback({ ...progressInfo, @@ -903,7 +847,7 @@ export class ONNXPipeline { text, tokens: streamerOptions.returnTokens ? chunkTokens : null, requestId, - isPrompt: false, + isPrompt: nextTokensArePrompt, }, type: lazy.Progress.ProgressType.INFERENCE, statusText: lazy.Progress.ProgressStatusText.IN_PROGRESS, @@ -915,13 +859,13 @@ export class ONNXPipeline { }); } - // Inject streamer into request options - const requestWithCallback = { - ...request, - options: { ...request.options, streamer }, - }; - - let result; + // Override streamer in options + const requestWithCallback = inferenceProgressCallback + ? { + ...request, + options: { ...request.options, streamer }, + } + : request; if (this.#genericPipelineFunction) { if (this.#config.modelId === "test-echo") { @@ -929,19 +873,15 @@ export class ONNXPipeline { output: requestWithCallback.args, config: this.#config, multiThreadSupported: isMultiThreadSupported(), - metrics: { ...metrics }, }; } else { - const start = ChromeUtils.now(); - let output = await this.#genericPipelineFunction( + result = await this.#genericPipelineFunction( ...requestWithCallback.args, requestWithCallback.options || {} ); - metrics.inferenceTime = ChromeUtils.now() - start; - if (output instanceof transformers.Tensor) { - output = output.tolist(); + if (result instanceof transformers.Tensor) { + result = result.tolist(); } - result = output; } } else { result = await this.#pipelineFunction( @@ -952,44 +892,9 @@ export class ONNXPipeline { this.#config, this.#modelConfig ); - result.metrics ??= {}; - } - - if (result.metrics) { - for (const [key, value] of Object.entries(result.metrics)) { - if (value !== undefined && value !== null) { - metrics[key] = value; - } - } - } - - snapshot("runEnd"); - const runEndTime = ChromeUtils.now(); - - // Calculate metrics - try { - // If we streamed, decoding time is Time(End) - Time(FirstToken). - // Otherwise, we fallback to inferenceTime (e.g. for embeddings or image-to-text without streaming). - if (metrics.timeToFirstToken !== null && firstTokenTimestamp !== null) { - metrics.decodingTime = runEndTime - firstTokenTimestamp; - } else { - metrics.decodingTime = metrics.inferenceTime; - } - - // Sync tokenizingTime with preprocessingTime for Glean metrics consistency - metrics.tokenizingTime = metrics.preprocessingTime; - - // Calculate throughput metrics if we have the necessary data - if (metrics.inferenceTime > 0 && metrics.outputTokens > 0) { - metrics.tokensPerSecond = - metrics.outputTokens / (metrics.inferenceTime / 1000); - metrics.timePerOutputToken = - metrics.inferenceTime / metrics.outputTokens; - } - } catch (e) { - lazy.console.debug("Error computing throughput metrics", e); } - result.metrics = metrics; + await this.#metricsSnapShot({ name: "runEnd" }); + result.metrics = this.#metrics; if (streamer) { inferenceProgressCallback?.({ diff --git a/toolkit/components/ml/metrics.yaml b/toolkit/components/ml/metrics.yaml @@ -181,9 +181,6 @@ firefox.ai.runtime: engineId: description: Engine id type: string - error: - description: error class or message - type: string run_inference_success_flow: type: event @@ -203,35 +200,11 @@ firefox.ai.runtime: tokenizing_time: type: quantity description: > - Time taken for tokenization (preprocessing) in milliseconds + Time taken for tokenization in milliseconds inference_time: type: quantity description: > Time taken for inference in milliseconds - decoding_time: - type: quantity - description: > - Time taken for decoding in milliseconds - input_tokens: - type: quantity - description: > - Number of tokens in the input prompt - output_tokens: - type: quantity - description: > - Number of tokens generated - time_to_first_token: - type: quantity - description: > - Time to first token in milliseconds (null if not applicable) - tokens_per_second: - type: quantity - description: > - Inference throughput in tokens per second (rounded integer) - time_per_output_token: - type: quantity - description: > - Average latency per output token in milliseconds (rounded integer) run_inference_success: type: labeled_timing_distribution diff --git a/toolkit/components/ml/tests/browser/browser_ml_telemetry.js b/toolkit/components/ml/tests/browser/browser_ml_telemetry.js @@ -58,52 +58,11 @@ add_task(async function test_default_telemetry() { ); { - info("Test the run_inference_success_flow event"); - const inferenceFlowEvents = - Glean.firefoxAiRuntime.runInferenceSuccessFlow.testGetValue(); - Assert.ok( - inferenceFlowEvents && !!inferenceFlowEvents.length, - "At least one run_inference_success_flow event was recorded" - ); - const lastInferenceEvent = inferenceFlowEvents.at(-1); - const { extra: inferenceExtra } = lastInferenceEvent; - - // Helper to check that a number field is present and >= 0 - const checkNumber = (key, isOptional = false) => { - const value = inferenceExtra[key]; - if (isOptional && (value === null || value === undefined)) { - return; // Optional field not present is OK - } - Assert.notEqual(value, null, `${key} should be present`); - const number = Number(value); // Quantities are stored as strings - Assert.ok(!Number.isNaN(number), `${key} should be a number`); - Assert.greaterOrEqual(number, 0, `${key} should be >= 0`); - }; - - // Check flow_id is present - Assert.ok(inferenceExtra.flow_id, "flow_id should be present"); - - // Check all required timing/token metrics - checkNumber("tokenizing_time", true); - checkNumber("inference_time", true); - checkNumber("decoding_time", true); - checkNumber("input_tokens", true); - checkNumber("output_tokens", true); - checkNumber("time_to_first_token", true); - checkNumber("tokens_per_second", true); - checkNumber("time_per_output_token", true); - } - - { info("Test the engine_run event"); await engineInstance.lastResourceRequest; const value = Glean.firefoxAiRuntime.engineRun.testGetValue(); - Assert.ok( - value && !!value.length, - "At least one engine_run event was recorded" - ); - const lastEngineRunEvent = value.at(-1); - const { extra } = lastEngineRunEvent; + Assert.equal(value?.length, 1, "One engine_run event was recorded"); + const [{ extra }] = value; const checkNumber = key => { const value = extra[key]; Assert.notEqual(value, null, `${key} should be present`);