commit a49b09a1fbd3e7495c953880b792167f89c3a5a7
parent 63a518731e56d809573217016ed0e0fc8a2f9732
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Thu, 2 Oct 2025 19:35:07 +0000
Bug 1990084 - Add some numpy npy parsing utilities; r=tarek
Differential Revision: https://phabricator.services.mozilla.com/D266426
Diffstat:
3 files changed, 268 insertions(+), 0 deletions(-)
diff --git a/toolkit/components/ml/content/Utils.sys.mjs b/toolkit/components/ml/content/Utils.sys.mjs
@@ -1462,3 +1462,157 @@ export function stringifyForLog(
}
return s;
}
+
+/**
+ * A forward-only cursor over an ArrayBuffer. Each read method consumes bytes
+ * starting at the current `offset` and moves `offset` past what it consumed.
+ */
+class ByteReader {
+  /**
+   * @param {ArrayBuffer} buffer The bytes to read; reading starts at offset 0.
+   */
+  constructor(buffer) {
+    this.buffer = buffer;
+    this.view = new DataView(buffer);
+    this.offset = 0;
+  }
+
+  /**
+   * Read one unsigned byte and advance by one.
+   *
+   * @returns {number}
+   */
+  uint8() {
+    // The cursor is advanced before the read is attempted.
+    const position = this.offset;
+    this.offset = position + 1;
+    return this.view.getUint8(position);
+  }
+
+  /**
+   * Read a two byte unsigned integer and advance by two.
+   *
+   * @param {"little" | "big"} endianess Byte order of the stored value.
+   * @returns {number}
+   */
+  uint16(endianess) {
+    const isLittleEndian = endianess == "little";
+    const start = this.offset;
+    const result = this.view.getUint16(start, isLittleEndian);
+    this.offset = start + 2;
+    return result;
+  }
+
+  /**
+   * Decode the next `length` bytes with a "latin1" TextDecoder and advance
+   * past them.
+   *
+   * @param {number} length Number of bytes to consume.
+   * @returns {string}
+   */
+  latin1(length) {
+    const start = this.offset;
+    const chunk = new Uint8Array(this.buffer, start, length);
+    this.offset = start + length;
+    return new TextDecoder("latin1").decode(chunk);
+  }
+
+  /**
+   * Copy everything from the current offset to the end of the buffer into a
+   * new ArrayBuffer. The cursor is not moved.
+   *
+   * @returns {ArrayBuffer}
+   */
+  sliceRemaining() {
+    const { buffer, offset } = this;
+    return buffer.slice(offset);
+  }
+}
+
+/**
+ * Parse an ArrayBuffer of a .npy file into a typed array and shape.
+ *
+ * https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html
+ *
+ * Only format version 1.x files in C order (not fortran_order) are accepted,
+ * with one of the dtypes f8, f4, f2, i4, i2, i1, u4, u2 or u1. The array data
+ * is copied out of `buffer`, so the input is never modified, and it is
+ * converted to the byte order of the host when the file's byte order differs.
+ *
+ * @param {ArrayBuffer} buffer The ArrayBuffer containing the .npy data.
+ * @returns {{data: TypedArray, shape: number[], dtype: string}}
+ *   `dtype` is the descr without its leading byte-order character, e.g. "f4".
+ * @throws {Error} If the magic bytes or major version are wrong, the array is
+ *   in fortran_order, the dtype is unsupported, or the number of elements
+ *   does not match the shape.
+ */
+export function parseNpy(buffer) {
+  const reader = new ByteReader(buffer);
+  if (reader.uint8() != 0x93 || reader.latin1(5) != "NUMPY") {
+    throw new Error("Not a valid .npy file");
+  }
+  const majorVersion = reader.uint8();
+  reader.uint8(); // minorVersion
+
+  if (majorVersion != 1) {
+    throw new Error("Only major version 1 is currently supported.");
+  }
+
+  const headerLength = reader.uint16("little");
+  let headerText = reader.latin1(headerLength).trim();
+
+  // Header is a Python dict string. Do some text manipulation to make it JSON parseable.
+  //
+  // "{'descr': '<f8', 'fortran_order': False, 'shape': (3, 4), }"
+  // "{'descr': '|u1', 'fortran_order': False, 'shape': (63091, 128), }"
+  headerText = headerText
+    .replace(/'/g, '"') // single to double quotes
+    .replace("False", "false")
+    .replace("True", "true")
+    .replace(/,\s*}/, "}") // trailing commas
+    .replace(/,\s*\)/, ")"); // trailing commas in tuple
+
+  const header = JSON.parse(
+    headerText.replace(/\((.*?)\)/, (m, inner) => {
+      // convert shape tuple into JSON array
+      return `[${inner.trim().replace(/, /g, ",")}]`;
+    })
+  );
+
+  if (header.fortran_order) {
+    throw new Error("Unable to parse an array using fortran_order");
+  }
+
+  const fullType = header.descr; // e.g. '<f8'
+  const byteOrder = fullType[0];
+  const dtype = fullType.slice(1);
+
+  const shape = header.shape;
+  // This is a copy that starts at byte 0 of a fresh buffer, so the typed array
+  // views below are always aligned and the byte swap cannot touch `buffer`.
+  const dataBuffer = reader.sliceRemaining();
+
+  let typedArray;
+  switch (dtype) {
+    case "f8": // float64
+      typedArray = new Float64Array(dataBuffer);
+      break;
+    case "f4": // float32
+      typedArray = new Float32Array(dataBuffer);
+      break;
+    case "f2": // float16
+      typedArray = new Float16Array(dataBuffer);
+      break;
+    case "i4": // int32
+      typedArray = new Int32Array(dataBuffer);
+      break;
+    case "i2": // int16
+      typedArray = new Int16Array(dataBuffer);
+      break;
+    case "i1": // int8
+      typedArray = new Int8Array(dataBuffer);
+      break;
+    case "u4": // uint32
+      typedArray = new Uint32Array(dataBuffer);
+      break;
+    case "u2": // uint16
+      typedArray = new Uint16Array(dataBuffer);
+      break;
+    case "u1": // uint8
+      typedArray = new Uint8Array(dataBuffer);
+      break;
+    default:
+      throw new Error(`Unsupported dtype: ${fullType}`);
+  }
+
+  let expectedLength = 1;
+  for (const size of shape) {
+    expectedLength *= size;
+  }
+  if (typedArray.length != expectedLength) {
+    throw new Error(
+      `The data length (${typedArray.length}) did not match the expected dimensions (${expectedLength}) for shape ${JSON.stringify(shape)}`
+    );
+  }
+
+  // Typed arrays read and write in the byte order of the host, so the decision
+  // to swap has to compare the file's byte order against the host's rather
+  // than assume a little-endian host.
+  const hostLittleEndian =
+    new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;
+  let fileLittleEndian;
+  if (byteOrder === "<") {
+    fileLittleEndian = true;
+  } else if (byteOrder === "|" || byteOrder === "=") {
+    // "|" is "not applicable" and "=" is native order: never swap these.
+    fileLittleEndian = hostLittleEndian;
+  } else {
+    // ">" (and, as before, anything unrecognized) is treated as big endian.
+    fileLittleEndian = false;
+  }
+
+  // If endianness doesn't match, swap the bytes of every element in place.
+  if (
+    fileLittleEndian !== hostLittleEndian &&
+    typedArray.BYTES_PER_ELEMENT > 1
+  ) {
+    const u8 = new Uint8Array(typedArray.buffer);
+    for (let i = 0; i < u8.length; i += typedArray.BYTES_PER_ELEMENT) {
+      u8.subarray(i, i + typedArray.BYTES_PER_ELEMENT).reverse();
+    }
+  }
+
+  return { data: typedArray, shape, dtype };
+}
diff --git a/toolkit/components/ml/tests/browser/browser_ml_utils.js b/toolkit/components/ml/tests/browser/browser_ml_utils.js
@@ -15,6 +15,7 @@ const {
addonIdToEngineId,
engineIdToAddonId,
stringifyForLog,
+ parseNpy,
} = ChromeUtils.importESModule("chrome://global/content/ml/Utils.sys.mjs");
/**
@@ -1330,3 +1331,62 @@ add_task(function test_stringifyForLog_top_level_bigint() {
"Top-level BigInt should stringify (quoted or not)"
);
});
+
+add_task(function test_npy_parsing() {
+  // Checks that parseNpy decodes real numpy output for a few dtypes. Each
+  // fixture below is the byte-for-byte result of np.save on the first five
+  // Fibonacci numbers.
+  //
+  // # Generate some npy arrays with python:
+  // import numpy as np
+  // import io
+  // fib5 = [0, 1, 1, 2, 3]
+  // fib5_u8 = np.array(fib5, dtype=np.uint8)
+  // fib5_f16 = np.array(fib5, dtype=np.float16)
+  // fib5_f32 = np.array(fib5, dtype=np.float32)
+  // def to_npy_uint8array(arr: np.ndarray) -> str:
+  //     buf = io.BytesIO()
+  //     np.save(buf, arr)
+  //     b = buf.getvalue()
+  //     return "new Uint8Array([" + ", ".join(str(x) for x in b) + "])"
+  // npy_u8_5 = to_npy_uint8array(fib5_u8)
+  // npy_f16_5 = to_npy_uint8array(fib5_f16)
+  // npy_f32_5 = to_npy_uint8array(fib5_f32)
+  // npy_u8_5, npy_f16_5, npy_f32_5
+
+  // prettier-ignore
+  const u8 = new Uint8Array([147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 124, 117, 49, 39, 44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32, 70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 10, 0, 1, 1, 2, 3])
+  // prettier-ignore
+  const fp16 = new Uint8Array([147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 60, 102, 50, 39, 44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32, 70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 10, 0, 0, 0, 60, 0, 60, 0, 64, 0, 66])
+  // prettier-ignore
+  const fp32 = new Uint8Array([147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 60, 102, 52, 39, 44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32, 70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 10, 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64])
+
+  // The values every fixture above encodes (fib5 in the python snippet).
+  const fib5 = [0, 1, 1, 2, 3];
+
+  const testCases = [
+    { name: "u8", npy: u8, expectedDtype: "u1" },
+    { name: "fp16", npy: fp16, expectedDtype: "f2" },
+    { name: "fp32", npy: fp32, expectedDtype: "f4" },
+  ];
+
+  for (const { name, npy, expectedDtype } of testCases) {
+    const { data, shape, dtype } = parseNpy(npy.buffer);
+    SimpleTest.isDeeply(data, fib5, `${name} encoding matches`);
+    SimpleTest.isDeeply(shape, [5], `${name} shape matches`);
+    // Use a distinct message so a dtype failure is not reported as a shape
+    // failure.
+    is(dtype, expectedDtype, `${name} dtype matches`);
+  }
+});
+
+/**
+ * Check that round tripping works for the numpy parsing, and the generation from
+ * test fixtures: an array encoded by generateFloat16Numpy must come back out of
+ * parseNpy with the same values, shape, and dtype.
+ */
+add_task(function test_npy_fixture() {
+  const vocabSize = 3;
+  const dimensions = 4;
+  const { numbers, encoding } = generateFloat16Numpy(vocabSize, dimensions);
+  const { data, shape, dtype } = parseNpy(encoding.buffer);
+  // The parsed value goes first (actual) and the generated one second
+  // (expected), matching the other parseNpy assertions in this file.
+  SimpleTest.isDeeply(data, numbers, "Round tripping produces the same array");
+  SimpleTest.isDeeply(shape, [vocabSize, dimensions], "The shape is preserved");
+  is(dtype, "f2", "The datatype is correctly fp16");
+});
diff --git a/toolkit/components/ml/tests/browser/head.js b/toolkit/components/ml/tests/browser/head.js
@@ -800,3 +800,57 @@ function startMockOpenAI({ echo = "This gets echoed." } = {}) {
function stopMockOpenAI(server) {
return new Promise(resolve => server.stop(resolve));
}
+
+/**
+ * Builds an in-memory version 1.0 .npy file containing a float16 array of
+ * shape (vocabSize, dimensions), to be used for generating static embeddings
+ * test data. The header declares the dtype as '<f2'; the data section is the
+ * raw bytes of the returned Float16Array.
+ *
+ * @param {number} vocabSize Number of rows.
+ * @param {number} dimensions Number of columns.
+ * @returns {{ numbers: Float16Array, encoding: Uint8Array }}
+ *   `numbers` holds the values that were written and `encoding` is the
+ *   complete .npy file.
+ */
+function generateFloat16Numpy(vocabSize, dimensions) {
+  // Element k holds k / 10, so the data is:
+  //   [0, 0.1, 0.2, ..., (vocabSize * dimensions - 1) / 10]
+  const count = vocabSize * dimensions;
+  const numbers = new Float16Array(count);
+  for (let index = 0; index < count; index++) {
+    numbers[index] = index / 10;
+  }
+
+  // "\x93NUMPY" followed by the version, 1.0.
+  const prefix = Uint8Array.of(0x93, 78, 85, 77, 80, 89, 1, 0);
+  // The two byte header length field follows the prefix.
+  const preLength = prefix.length + 2;
+
+  const dict = `{'descr': '<f2', 'fortran_order': False, 'shape': (${vocabSize},${dimensions}), }`;
+
+  // Pad with spaces so that everything before the data, including the
+  // terminating newline, is a multiple of 16 bytes long.
+  const remainder = (preLength + dict.length + 1) % 16;
+  const padding = remainder === 0 ? 0 : 16 - remainder;
+  const headerBytes = new TextEncoder().encode(
+    dict + " ".repeat(padding) + "\n"
+  );
+
+  // The header length is stored as a little-endian uint16.
+  const headerLen = Uint8Array.of(
+    headerBytes.length & 0xff,
+    (headerBytes.length >>> 8) & 0xff
+  );
+
+  const encoding = new Uint8Array(
+    preLength + headerBytes.length + numbers.byteLength
+  );
+
+  // Write each section directly after the previous one.
+  const parts = [
+    prefix,
+    headerLen,
+    headerBytes,
+    new Uint8Array(numbers.buffer),
+  ];
+  let cursor = 0;
+  for (const part of parts) {
+    encoding.set(part, cursor);
+    cursor += part.length;
+  }
+
+  return { numbers, encoding };
+}