commit e30793607c53fbcd31bf2411944dfb36675d3e79
parent 29933f495e98918fb343238dde25f551489e2e5c
Author: Serban Stanca <sstanca@mozilla.com>
Date: Wed, 10 Dec 2025 03:49:25 +0200
Revert "Bug 2000033, Bug 2000682: Integrate Glean MLTelemetry into the inference pipeline r=ai-ondevice-reviewers,gregtatum,rconcepcion" for causing perftests bustages.
This reverts commit 5989e4930b3768e4914e5414b88f93ec580a3fc7.
This reverts commit 416543f28ae08552f777d44acd259c087b3fb3aa.
Diffstat:
5 files changed, 172 insertions(+), 392 deletions(-)
diff --git a/toolkit/components/ml/MLTelemetry.sys.mjs b/toolkit/components/ml/MLTelemetry.sys.mjs
@@ -11,10 +11,6 @@ ChromeUtils.defineLazyGetter(lazy, "console", () => {
});
});
-ChromeUtils.defineESModuleGetters(lazy, {
- isAddonEngineId: "chrome://global/content/ml/Utils.sys.mjs",
-});
-
/**
* MLTelemetry provides a mechanism tracking a "flow" of operations
* related to a machine learning feature. A flow is a sequence of related
@@ -74,20 +70,6 @@ export class MLTelemetry {
}
/**
- * Returns the label used in telemetry for a given engine ID.
- * Converts addon engine IDs to "webextension" label.
- *
- * @param {string} engineId - The engine ID to convert.
- * @returns {string} The Glean label for the engine.
- */
- static getGleanLabel(engineId) {
- if (lazy.isAddonEngineId(engineId)) {
- return "webextension";
- }
- return engineId;
- }
-
- /**
* Starts a telemetry session for the given flow.
*
* @param {object} [options] - Session start options.
@@ -167,12 +149,13 @@ export class MLTelemetry {
* @param {object} options - Engine creation success options.
* @param {string} [options.flowId] - The flow ID. Uses instance flowId if not provided.
* @param {string} options.engineId - The engine identifier (e.g., "pdfjs", "ml-suggest-intent").
+ * @param {string} [options.label] - Label for the old timing distribution metric. Defaults to engineId if not provided.
* @param {number} options.duration - Engine creation time in milliseconds.
*/
- recordEngineCreationSuccessFlow({ flowId, engineId, duration }) {
+ recordEngineCreationSuccessFlow({ flowId, engineId, label, duration }) {
const currentFlowId = flowId || this.#flowId;
const actualEngineId = engineId;
- const actualLabel = MLTelemetry.getGleanLabel(engineId);
+ const actualLabel = label || engineId;
Glean.firefoxAiRuntime.engineCreationSuccessFlow.record({
flow_id: currentFlowId,
@@ -202,7 +185,7 @@ export class MLTelemetry {
* @param {string} options.featureId - The feature identifier.
* @param {string} options.taskName - The task name.
* @param {string} options.engineId - The engine identifier.
- * @param {string|object} options.error - The error class/message or object.
+ * @param {string} options.error - The error class/message.
*/
recordEngineCreationFailure({
flowId,
@@ -213,11 +196,6 @@ export class MLTelemetry {
error,
}) {
const currentFlowId = flowId || this.#flowId;
- // Ensure error is always a string
- const errorString =
- typeof error === "object" && error !== null
- ? String(error.name || error.message || error)
- : String(error);
Glean.firefoxAiRuntime.engineCreationFailure.record({
flow_id: currentFlowId,
@@ -225,7 +203,7 @@ export class MLTelemetry {
featureId,
taskName,
engineId,
- error: errorString,
+ error,
});
this.logEventToConsole(this.recordEngineCreationFailure, {
@@ -234,112 +212,67 @@ export class MLTelemetry {
featureId,
taskName,
engineId,
- error: errorString,
+ error,
});
}
/**
* Records a successful inference run event.
*
- * @param {string} engineId - The engine identifier.
- * @param {object} metrics - The inference metrics object.
- * @param {number} [metrics.preprocessingTime] - Time spent preprocessing (legacy).
- * @param {number} [metrics.tokenizingTime] - Time spent tokenizing in milliseconds.
- * @param {number} [metrics.inferenceTime] - Time spent on inference in milliseconds.
- * @param {number} [metrics.decodingTime] - Time spent decoding in milliseconds.
- * @param {number} [metrics.inputTokens] - Number of input tokens.
- * @param {number} [metrics.outputTokens] - Number of output tokens.
- * @param {number} [metrics.timeToFirstToken] - Time to first token in milliseconds.
- * @param {number} [metrics.tokensPerSecond] - Tokens per second.
- * @param {number} [metrics.timePerOutputToken] - Time per output token in milliseconds.
+ * @param {object} options - Inference success options.
+ * @param {string} [options.flowId] - The flow ID. Uses instance flowId if not provided.
+ * @param {string} [options.engineId] - The engine identifier. Defaults to undefined.
+ * @param {string} [options.label] - Label for the old timing distribution metric. Defaults to no-label if not provided.
+ * @param {number} options.tokenizingTime - Time spent tokenizing in milliseconds.
+ * @param {number} options.inferenceTime - Time spent on inference in milliseconds.
*/
- recordRunInferenceSuccessFlow(engineId, metrics) {
- try {
- const currentFlowId = this.#flowId;
- const EngineId = engineId || undefined;
- const Label = engineId ? MLTelemetry.getGleanLabel(engineId) : "no-label";
-
- // Handle legacy preprocessingTime field
- const tokenizingTime =
- metrics.preprocessingTime ?? metrics.tokenizingTime;
-
- // Ensure all metrics are properly rounded/typed for Glean
- // This will be updated to use the method from revision(D271263)
- const gleanPayload = {
- flow_id: currentFlowId,
- tokenizing_time:
- tokenizingTime != null ? Math.round(tokenizingTime) : undefined,
- inference_time:
- metrics.inferenceTime != null
- ? Math.round(metrics.inferenceTime)
- : undefined,
- decoding_time:
- metrics.decodingTime != null
- ? Math.round(metrics.decodingTime)
- : undefined,
- input_tokens:
- metrics.inputTokens != null
- ? Math.round(metrics.inputTokens)
- : undefined,
- output_tokens:
- metrics.outputTokens != null
- ? Math.round(metrics.outputTokens)
- : undefined,
- time_to_first_token:
- metrics.timeToFirstToken != null
- ? Math.round(metrics.timeToFirstToken)
- : undefined,
- tokens_per_second:
- metrics.tokensPerSecond != null
- ? Math.round(metrics.tokensPerSecond * 100) / 100
- : undefined,
- time_per_output_token:
- metrics.timePerOutputToken != null
- ? Math.round(metrics.timePerOutputToken * 100) / 100
- : undefined,
- };
-
- Glean.firefoxAiRuntime.runInferenceSuccessFlow.record(gleanPayload);
+ recordRunInferenceSuccessFlow({
+ flowId,
+ engineId,
+ label,
+ tokenizingTime,
+ inferenceTime,
+ }) {
+ const currentFlowId = flowId || this.#flowId;
+ const EngineId = engineId || undefined;
+ const Label = label || "no-label";
- // record the old labeled timing distribution metric
- const totalTime = Math.round(
- (tokenizingTime || 0) +
- (metrics.inferenceTime || 0) +
- (metrics.decodingTime || 0)
- );
+ Glean.firefoxAiRuntime.runInferenceSuccessFlow.record({
+ flow_id: currentFlowId,
+ tokenizing_time: Math.round(tokenizingTime),
+ inference_time: Math.round(inferenceTime),
+ });
- Glean.firefoxAiRuntime.runInferenceSuccess[Label].accumulateSingleSample(
- totalTime
- );
+ // Also record the old labeled timing distribution metric
+ const totalTime = Math.round(tokenizingTime + inferenceTime);
+ Glean.firefoxAiRuntime.runInferenceSuccess[Label].accumulateSingleSample(
+ totalTime
+ );
- this.logEventToConsole(this.recordRunInferenceSuccessFlow, {
- ...gleanPayload,
- engineId: EngineId,
- label: Label,
- });
- } catch (telemetryError) {
- lazy.console.error("Failed to record ML telemetry:", telemetryError);
- }
+ this.logEventToConsole(this.recordRunInferenceSuccessFlow, {
+ flowId: currentFlowId,
+ engineId: EngineId,
+ label: Label,
+ tokenizingTime,
+ inferenceTime,
+ });
}
/**
* Records a failed inference run event.
*
- * @param {string|object} error - The error class/message or object.
+ * @param {string} error - The error class/message.
* @param {string} [flow_id=this.#flowId] - The flow ID. Uses instance flowId if not provided.
*/
recordRunInferenceFailure(error, flow_id = this.flowId) {
- // Ensure error is always a string
- const errorString = error instanceof Error ? error.message : String(error);
-
Glean.firefoxAiRuntime.runInferenceFailure.record({
flow_id,
- error: errorString,
+ error,
});
this.logEventToConsole(this.recordRunInferenceFailure, {
flow_id,
- error: errorString,
+ error,
});
}
diff --git a/toolkit/components/ml/actors/MLEngineParent.sys.mjs b/toolkit/components/ml/actors/MLEngineParent.sys.mjs
@@ -2,7 +2,6 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
-import { MLTelemetry } from "chrome://global/content/ml/MLTelemetry.sys.mjs";
/**
* @import { MLEngineChild } from "./MLEngineChild.sys.mjs"
@@ -16,6 +15,7 @@ const lazy = XPCOMUtils.declareLazy({
clearTimeout: "resource://gre/modules/Timer.sys.mjs",
ModelHub: "chrome://global/content/ml/ModelHub.sys.mjs",
Progress: "chrome://global/content/ml/Utils.sys.mjs",
+ isAddonEngineId: "chrome://global/content/ml/Utils.sys.mjs",
OPFS: "chrome://global/content/ml/OPFS.sys.mjs",
BACKENDS: "chrome://global/content/ml/EngineProcess.sys.mjs",
stringifyForLog: "chrome://global/content/ml/Utils.sys.mjs",
@@ -222,7 +222,7 @@ export class MLEngineParent extends JSProcessActorParent {
});
}
- const { featureId, engineId } = pipelineOptions;
+ const engineId = pipelineOptions.engineId;
// Allow notifications callback changes even when reusing engine.
this.notificationsCallback = notificationsCallback;
@@ -235,7 +235,6 @@ export class MLEngineParent extends JSProcessActorParent {
Promise.withResolvers();
MLEngineParent.engineLocks.set(engineId, lockPromise);
MLEngineParent.engineCreationAbortSignal.set(engineId, abortSignal);
-
try {
const currentEngine = MLEngine.getInstance(engineId);
if (currentEngine) {
@@ -274,24 +273,12 @@ export class MLEngineParent extends JSProcessActorParent {
const creationTime = ChromeUtils.now() - start;
- engine.telemetry.recordEngineCreationSuccessFlow({
- engineId,
- duration: creationTime,
- });
+ Glean.firefoxAiRuntime.engineCreationSuccess[
+ engine.getGleanLabel()
+ ].accumulateSingleSample(creationTime);
// TODO - What happens if the engine is already killed here?
return engine;
- } catch (error) {
- const { modelId, taskName, flowId } = pipelineOptions;
- const telemetry = new MLTelemetry({ featureId, flowId });
- telemetry.recordEngineCreationFailure({
- modelId,
- featureId,
- taskName,
- engineId,
- error,
- });
- throw error;
} finally {
MLEngineParent.engineLocks.delete(engineId);
MLEngineParent.engineCreationAbortSignal.delete(engineId);
@@ -994,6 +981,18 @@ export class MLEngine {
notificationsCallback = null;
/**
+ * Returns the label used in telemetry for that engine id
+ *
+ * @returns {string}
+ */
+ getGleanLabel() {
+ if (lazy.isAddonEngineId(this.engineId)) {
+ return "webextension";
+ }
+ return this.engineId;
+ }
+
+ /**
* Removes an instance of the MLEngine with the given engineId.
*
* @param {string} engineId - The ID of the engine instance to be removed.
@@ -1038,10 +1037,6 @@ export class MLEngine {
this.mlEngineParent = mlEngineParent;
this.pipelineOptions = pipelineOptions;
this.notificationsCallback = notificationsCallback;
- this.telemetry = new MLTelemetry({
- featureId: pipelineOptions.featureId,
- flowId: pipelineOptions.flowId,
- });
}
/**
@@ -1330,20 +1325,28 @@ export class MLEngine {
const request = this.#requests.get(requestId);
if (request) {
if (error) {
- this.telemetry.recordRunInferenceFailure(error);
- request.reject(error);
- } else if (response) {
+ Glean.firefoxAiRuntime.runInferenceFailure.record({
+ engineId: this.engineId,
+ modelId: this.pipelineOptions.modelId,
+ featureId: this.pipelineOptions.featureId,
+ });
+ }
+ if (response) {
// Validate response before returning to caller
const validatedResponse = this.#validateResponse(response);
if (!validatedResponse) {
request.reject(new Error("Response failed security validation"));
} else {
- this.telemetry.recordRunInferenceSuccessFlow(
- this.engineId,
- validatedResponse.metrics
- );
+ const totalTime =
+ validatedResponse.metrics.tokenizingTime +
+ validatedResponse.metrics.inferenceTime;
+ Glean.firefoxAiRuntime.runInferenceSuccess[
+ this.getGleanLabel()
+ ].accumulateSingleSample(totalTime);
request.resolve(validatedResponse);
}
+ } else {
+ request.reject(error);
}
} else {
lazy.console.error(
@@ -1545,19 +1548,26 @@ export class MLEngine {
(resourcesAfter.cpuTime - resourcesBefore.cpuTime) / 1_000_000;
const wallMilliseconds = ChromeUtils.now() - beforeRun;
const cores = lazy.mlUtils.getOptimalCPUConcurrency();
- const cpuUtilization = (cpuMilliseconds / wallMilliseconds / cores) * 100;
+ const cpuUtilization = cpuMilliseconds / wallMilliseconds / cores;
const memoryBytes = resourcesAfter.memory;
- this.telemetry.recordEngineRun({
- cpuMilliseconds,
- wallMilliseconds,
+ const data = {
+ // Timing:
+ cpu_milliseconds: cpuMilliseconds,
+ wall_milliseconds: wallMilliseconds,
cores,
- cpuUtilization,
- memoryBytes,
- engineId: this.engineId,
- modelId: this.pipelineOptions.modelId,
+ cpu_utilization: cpuUtilization,
+ memory_bytes: memoryBytes,
+
+ // Model information:
+ engine_id: this.engineId,
+ model_id: this.pipelineOptions.modelId,
+ feature_id: this.pipelineOptions.featureId,
backend: this.pipelineOptions.backend,
- });
+ };
+
+ lazy.console?.debug("[Glean.firefoxAiRuntime.engineRun]", data);
+ Glean.firefoxAiRuntime.engineRun.record(data);
});
return resolvers.promise;
diff --git a/toolkit/components/ml/content/backends/ONNXPipeline.mjs b/toolkit/components/ml/content/backends/ONNXPipeline.mjs
@@ -9,20 +9,6 @@
* @typedef {import("../../content/Utils.sys.mjs").ProgressAndStatusCallbackParams} ProgressAndStatusCallbackParams
*/
-/**
- * @typedef {object} PipelineMetrics
- * @property {number} [preprocessingTime] - Time spent preprocessing inputs (ms).
- * @property {number} [tokenizingTime] - Time spent tokenizing (same as preprocessingTime, for Glean consistency) (ms).
- * @property {number} [inferenceTime] - Time spent running the model (ms).
- * @property {number} [decodingTime] - Time spent decoding outputs (ms).
- * @property {number} inputTokens - Number of tokens in the input.
- * @property {number} outputTokens - Number of tokens in the output.
- * @property {number|null} [timeToFirstToken] - Time to the first generated token (ms).
- * @property {number} [tokensPerSecond] - Inference throughput (tokens/s).
- * @property {number} [timePerOutputToken] - Latency per output token (ms).
- * @property {Array<object>} [runTimestamps] - Timeline of execution events.
- */
-
/* eslint-disable-next-line mozilla/reject-import-system-module-from-non-system */
import { AppConstants } from "resource://gre/modules/AppConstants.sys.mjs";
@@ -77,7 +63,7 @@ let transformers = null;
* @function importTransformers
* @param {string} backend - The backend to use (e.g. "onnx-native" or "onnx").
* @returns {Promise<import("chrome://global/content/ml/transformers-dev.js")>}
- * A promise that resolves once the Transformers library is imported.
+ * A promise that resolves once the Transformers library is imported.
*/
export async function importTransformers(backend) {
if (transformers) {
@@ -177,10 +163,7 @@ async function echo(request, _model, _tokenizer, _processor, config) {
return {
metrics: {
- preprocessingTime: 0,
- decodingTime: 0,
- inputTokens: 0,
- outputTokens: 0,
+ tokenizingTime: 0,
},
output: result,
};
@@ -206,16 +189,10 @@ async function imageToText(request, model, tokenizer, processor, _config) {
let result = {
metrics: {
inferenceTime: 0,
- preprocessingTime: 0,
- decodingTime: 0,
- inputTokens: null,
- outputTokens: 0,
+ tokenizingTime: 0,
},
};
- // Destructure to simplify assignments
- const { metrics } = result;
-
- let startLoad = ChromeUtils.now();
+ let start = Date.now();
let rawImage;
if ("url" in request) {
@@ -229,28 +206,27 @@ async function imageToText(request, model, tokenizer, processor, _config) {
);
}
- lazy.console.debug("Image loaded in ", ChromeUtils.now() - startLoad);
+ lazy.console.debug("Image loaded in ", Date.now() - start);
- const startProcessing = ChromeUtils.now();
const { pixel_values } = await processor(rawImage);
- metrics.preprocessingTime += ChromeUtils.now() - startProcessing;
+ result.metrics.tokenizingTime += Date.now() - start;
const toReturn = [];
const streamer = request.options?.streamer;
for (const batch of pixel_values) {
batch.dims = [1, ...batch.dims];
- const startInference = ChromeUtils.now();
+ start = Date.now();
const output = await model.generate({ inputs: batch, streamer });
- metrics.inferenceTime += ChromeUtils.now() - startInference;
- const startDecoding = ChromeUtils.now();
+ result.metrics.inferenceTime += Date.now() - start;
+ start = Date.now();
const decoded = tokenizer
.batch_decode(output, {
skip_special_tokens: true,
})
.map(x => ({ generated_text: x.trim() }));
- metrics.decodingTime += ChromeUtils.now() - startDecoding;
+ result.metrics.tokenizingTime += Date.now() - start;
toReturn.push(decoded);
}
- lazy.console.debug("Inference done in ", ChromeUtils.now() - startProcessing);
+ lazy.console.debug("Inference done in ", Date.now() - start);
result.output = toReturn[0][0].generated_text;
// Bug 1918220 - replace the result for models with that bug
@@ -291,14 +267,11 @@ async function textToGoal(
) {
const result = {
metrics: {
- preprocessingTime: 0,
+ tokenizingTime: 0,
inferenceTime: 0,
- inputTokens: 0,
- outputTokens: 0,
},
output: [],
};
- const { metrics } = result;
const texts = request.args?.[0] ?? [];
const taskTypes = request.args?.[1] ?? []; // ["query", "page", ...]
@@ -309,7 +282,7 @@ async function textToGoal(
const task = taskTypes[i] ?? "query";
const domain = domains[i] ?? "";
- const startToken = ChromeUtils.now();
+ const startToken = Date.now();
const encoded = await tokenizer(text, {
padding: "max_length",
@@ -317,7 +290,7 @@ async function textToGoal(
max_length: 64,
return_attention_mask: true,
});
- metrics.preprocessingTime += ChromeUtils.now() - startToken;
+ result.metrics.tokenizingTime += Date.now() - startToken;
const input_ids = encoded.input_ids.ort_tensor;
const attention_mask = encoded.attention_mask.ort_tensor;
const domain_vocab = modelConfig["transformers.js_config"].domain_vocab;
@@ -345,11 +318,10 @@ async function textToGoal(
task_type,
};
- const startInfer = ChromeUtils.now();
+ const startInfer = Date.now();
const session = model.sessions.model;
const output = await session.run(inputs);
- metrics.inferenceTime += ChromeUtils.now() - startInfer;
- metrics.inputTokens += encoded.input_ids.ort_tensor.dims[1];
+ result.metrics.inferenceTime += Date.now() - startInfer;
result.output.push({
embedding: Array.from(output.embedding.data),
@@ -649,7 +621,7 @@ export class ONNXPipeline {
async #metricsSnapShot({ name, snapshot = {} }) {
if (!("when" in snapshot)) {
- snapshot.when = ChromeUtils.now();
+ snapshot.when = Date.now();
}
this.#metrics.push({ name, ...snapshot });
}
@@ -667,7 +639,7 @@ export class ONNXPipeline {
*/
static async initialize(mlEngineWorker, runtime, options, errorFactory) {
let snapShot = {
- when: ChromeUtils.now(),
+ when: Date.now(),
};
if (options.logLevel) {
@@ -788,114 +760,86 @@ export class ONNXPipeline {
async run(request, requestId, inferenceProgressCallback = null) {
lazy.console.debug("Running task: ", this.#config.taskName);
- /** @type {PipelineMetrics} */
- const metrics = {
- inputTokens: 0,
- outputTokens: 0,
- preprocessingTime: 0,
- tokenizingTime: 0, // Same as preprocessingTime, but named for Glean consistency
- inferenceTime: 0,
- decodingTime: 0,
- timeToFirstToken: null,
- tokensPerSecond: 0,
- timePerOutputToken: 0,
- runTimestamps: [],
- };
-
- /**
- * Helper to record a timestamp in the metrics timeline.
- *
- * @param {string} name
- */
- const snapshot = name => {
- metrics.runTimestamps.push({ name, when: ChromeUtils.now() });
- };
-
- const runStartTime = ChromeUtils.now();
- snapshot("runStart");
+ let result;
+ await this.#metricsSnapShot({ name: "runStart" });
const tokenizer =
this.#genericPipelineFunction?.tokenizer ?? this.#tokenizer;
- if (this.#genericPipelineFunction && tokenizer && request.args?.[0]) {
- try {
- const inputs = [request.args[0]].flat();
- for (const text of inputs) {
- if (typeof text === "string") {
- const encoded = await tokenizer.encode(text);
- metrics.inputTokens += encoded.length;
- }
- }
- } catch (e) {
- lazy.console.debug(
- "Could not count input tokens for generic pipeline",
- e
- );
- }
- }
-
const progressInfo = {
ok: true,
id: request.id ?? requestId,
};
const streamerOptions = {
- perTokens: true,
+ perTokens: false,
skipPrompt: true,
returnTokens: false,
...request.streamerOptions,
};
- let streamer;
+ let streamer = undefined;
let chunkTokens = [];
- // Removed unused chunkText declaration here
-
- let firstTokenTimestamp = null;
- if (tokenizer) {
+ let chunkText = "";
+ let nextTokensArePrompt = !streamerOptions.skipPrompt;
+ let restoreTokenizer = false;
+
+ if (tokenizer && inferenceProgressCallback) {
+ const flushPrompts = _tokens => {
+ streamer.token_cache = _tokens;
+ streamer.end();
+ streamer.tokenizer = {
+ decode: () => {
+ streamer.token_cache = [];
+ return "";
+ },
+ };
+ restoreTokenizer = true;
+ streamer.next_tokens_are_prompt = false;
+ };
streamer = new transformers.TextStreamer(tokenizer, {
skip_prompt: streamerOptions.skipPrompt,
decode_kwargs: {
skip_special_tokens: true,
},
token_callback_function: tokens => {
- // Record Time To First Token on the very first callback
- const now = ChromeUtils.now();
- if (metrics.timeToFirstToken === null) {
- metrics.timeToFirstToken = now - runStartTime;
- firstTokenTimestamp = now;
- }
-
- metrics.outputTokens += tokens.length;
-
- // Only proceed with buffering if we have a callback to call
- if (!inferenceProgressCallback) {
- return;
- }
-
- if (streamerOptions.perTokens) {
- // Logic handled in callback_function
- } else {
- // Append newly received tokens.
- chunkTokens.push(tokens);
- }
- },
- // Per-word (or per-token if perTokens=true) callback function
- callback_function: text => {
- if (!inferenceProgressCallback) {
- return;
+ if (restoreTokenizer) {
+ streamer.tokenizer = tokenizer;
+ restoreTokenizer = false;
}
if (streamerOptions.perTokens) {
+ if (nextTokensArePrompt) {
+ flushPrompts(tokens);
+ }
+
inferenceProgressCallback({
...progressInfo,
metadata: {
- text,
- tokens: streamerOptions.returnTokens ? chunkTokens : null,
+ text: chunkText,
+ tokens: streamerOptions.returnTokens ? tokens : null,
+ isPrompt: nextTokensArePrompt,
requestId,
- isPrompt: false, // skipping prompt, so assumed false
},
type: lazy.Progress.ProgressType.INFERENCE,
statusText: lazy.Progress.ProgressStatusText.IN_PROGRESS,
});
+
+ // We have sent the text, now resetting it
+ chunkText = "";
+ } else {
+ // Append newly received tokens.
+ chunkTokens.push(tokens);
+
+ if (nextTokensArePrompt) {
+ flushPrompts(tokens);
+ }
+ }
+ nextTokensArePrompt = false;
+ },
+ // Per-word callback function
+ callback_function: text => {
+ if (streamerOptions.perTokens) {
+ chunkText = text;
} else {
inferenceProgressCallback({
...progressInfo,
@@ -903,7 +847,7 @@ export class ONNXPipeline {
text,
tokens: streamerOptions.returnTokens ? chunkTokens : null,
requestId,
- isPrompt: false,
+ isPrompt: nextTokensArePrompt,
},
type: lazy.Progress.ProgressType.INFERENCE,
statusText: lazy.Progress.ProgressStatusText.IN_PROGRESS,
@@ -915,13 +859,13 @@ export class ONNXPipeline {
});
}
- // Inject streamer into request options
- const requestWithCallback = {
- ...request,
- options: { ...request.options, streamer },
- };
-
- let result;
+ // Override streamer in options
+ const requestWithCallback = inferenceProgressCallback
+ ? {
+ ...request,
+ options: { ...request.options, streamer },
+ }
+ : request;
if (this.#genericPipelineFunction) {
if (this.#config.modelId === "test-echo") {
@@ -929,19 +873,15 @@ export class ONNXPipeline {
output: requestWithCallback.args,
config: this.#config,
multiThreadSupported: isMultiThreadSupported(),
- metrics: { ...metrics },
};
} else {
- const start = ChromeUtils.now();
- let output = await this.#genericPipelineFunction(
+ result = await this.#genericPipelineFunction(
...requestWithCallback.args,
requestWithCallback.options || {}
);
- metrics.inferenceTime = ChromeUtils.now() - start;
- if (output instanceof transformers.Tensor) {
- output = output.tolist();
+ if (result instanceof transformers.Tensor) {
+ result = result.tolist();
}
- result = output;
}
} else {
result = await this.#pipelineFunction(
@@ -952,44 +892,9 @@ export class ONNXPipeline {
this.#config,
this.#modelConfig
);
- result.metrics ??= {};
- }
-
- if (result.metrics) {
- for (const [key, value] of Object.entries(result.metrics)) {
- if (value !== undefined && value !== null) {
- metrics[key] = value;
- }
- }
- }
-
- snapshot("runEnd");
- const runEndTime = ChromeUtils.now();
-
- // Calculate metrics
- try {
- // If we streamed, decoding time is Time(End) - Time(FirstToken).
- // Otherwise, we fallback to inferenceTime (e.g. for embeddings or image-to-text without streaming).
- if (metrics.timeToFirstToken !== null && firstTokenTimestamp !== null) {
- metrics.decodingTime = runEndTime - firstTokenTimestamp;
- } else {
- metrics.decodingTime = metrics.inferenceTime;
- }
-
- // Sync tokenizingTime with preprocessingTime for Glean metrics consistency
- metrics.tokenizingTime = metrics.preprocessingTime;
-
- // Calculate throughput metrics if we have the necessary data
- if (metrics.inferenceTime > 0 && metrics.outputTokens > 0) {
- metrics.tokensPerSecond =
- metrics.outputTokens / (metrics.inferenceTime / 1000);
- metrics.timePerOutputToken =
- metrics.inferenceTime / metrics.outputTokens;
- }
- } catch (e) {
- lazy.console.debug("Error computing throughput metrics", e);
}
- result.metrics = metrics;
+ await this.#metricsSnapShot({ name: "runEnd" });
+ result.metrics = this.#metrics;
if (streamer) {
inferenceProgressCallback?.({
diff --git a/toolkit/components/ml/metrics.yaml b/toolkit/components/ml/metrics.yaml
@@ -181,9 +181,6 @@ firefox.ai.runtime:
engineId:
description: Engine id
type: string
- error:
- description: error class or message
- type: string
run_inference_success_flow:
type: event
@@ -203,35 +200,11 @@ firefox.ai.runtime:
tokenizing_time:
type: quantity
description: >
- Time taken for tokenization (preprocessing) in milliseconds
+ Time taken for tokenization in milliseconds
inference_time:
type: quantity
description: >
Time taken for inference in milliseconds
- decoding_time:
- type: quantity
- description: >
- Time taken for decoding in milliseconds
- input_tokens:
- type: quantity
- description: >
- Number of tokens in the input prompt
- output_tokens:
- type: quantity
- description: >
- Number of tokens generated
- time_to_first_token:
- type: quantity
- description: >
- Time to first token in milliseconds (null if not applicable)
- tokens_per_second:
- type: quantity
- description: >
- Inference throughput in tokens per second (rounded integer)
- time_per_output_token:
- type: quantity
- description: >
- Average latency per output token in milliseconds (rounded integer)
run_inference_success:
type: labeled_timing_distribution
diff --git a/toolkit/components/ml/tests/browser/browser_ml_telemetry.js b/toolkit/components/ml/tests/browser/browser_ml_telemetry.js
@@ -58,52 +58,11 @@ add_task(async function test_default_telemetry() {
);
{
- info("Test the run_inference_success_flow event");
- const inferenceFlowEvents =
- Glean.firefoxAiRuntime.runInferenceSuccessFlow.testGetValue();
- Assert.ok(
- inferenceFlowEvents && !!inferenceFlowEvents.length,
- "At least one run_inference_success_flow event was recorded"
- );
- const lastInferenceEvent = inferenceFlowEvents.at(-1);
- const { extra: inferenceExtra } = lastInferenceEvent;
-
- // Helper to check that a number field is present and >= 0
- const checkNumber = (key, isOptional = false) => {
- const value = inferenceExtra[key];
- if (isOptional && (value === null || value === undefined)) {
- return; // Optional field not present is OK
- }
- Assert.notEqual(value, null, `${key} should be present`);
- const number = Number(value); // Quantities are stored as strings
- Assert.ok(!Number.isNaN(number), `${key} should be a number`);
- Assert.greaterOrEqual(number, 0, `${key} should be >= 0`);
- };
-
- // Check flow_id is present
- Assert.ok(inferenceExtra.flow_id, "flow_id should be present");
-
- // Check all required timing/token metrics
- checkNumber("tokenizing_time", true);
- checkNumber("inference_time", true);
- checkNumber("decoding_time", true);
- checkNumber("input_tokens", true);
- checkNumber("output_tokens", true);
- checkNumber("time_to_first_token", true);
- checkNumber("tokens_per_second", true);
- checkNumber("time_per_output_token", true);
- }
-
- {
info("Test the engine_run event");
await engineInstance.lastResourceRequest;
const value = Glean.firefoxAiRuntime.engineRun.testGetValue();
- Assert.ok(
- value && !!value.length,
- "At least one engine_run event was recorded"
- );
- const lastEngineRunEvent = value.at(-1);
- const { extra } = lastEngineRunEvent;
+ Assert.equal(value?.length, 1, "One engine_run event was recorded");
+ const [{ extra }] = value;
const checkNumber = key => {
const value = extra[key];
Assert.notEqual(value, null, `${key} should be present`);