tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit a49b09a1fbd3e7495c953880b792167f89c3a5a7
parent 63a518731e56d809573217016ed0e0fc8a2f9732
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Thu,  2 Oct 2025 19:35:07 +0000

Bug 1990084 - Add some numpy npy parsing utilities; r=tarek

Differential Revision: https://phabricator.services.mozilla.com/D266426

Diffstat:
Mtoolkit/components/ml/content/Utils.sys.mjs | 154+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtoolkit/components/ml/tests/browser/browser_ml_utils.js | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtoolkit/components/ml/tests/browser/head.js | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 268 insertions(+), 0 deletions(-)

// diff --git a/toolkit/components/ml/content/Utils.sys.mjs b/toolkit/components/ml/content/Utils.sys.mjs
// @@ -1462,3 +1462,157 @@ export function stringifyForLog(
//    }
//    return s;
//  }

/**
 * Reads sequentially from an ArrayBuffer, keeping track of the current offset.
 */
class ByteReader {
  /**
   * @param {ArrayBuffer} buffer The buffer to read from, starting at byte 0.
   */
  constructor(buffer) {
    this.offset = 0;
    this.buffer = buffer;
    this.view = new DataView(buffer);
  }

  /**
   * Reads one unsigned byte and advances the offset by 1.
   *
   * @returns {number}
   */
  uint8() {
    return this.view.getUint8(this.offset++);
  }

  /**
   * Reads an unsigned 16-bit integer and advances the offset by 2.
   *
   * @param {"little" | "big"} endianness
   * @returns {number}
   */
  uint16(endianness) {
    const value = this.view.getUint16(this.offset, endianness === "little");
    this.offset += 2;
    return value;
  }

  /**
   * Decodes `length` bytes as text and advances the offset past them.
   *
   * @param {number} length
   * @returns {string}
   */
  latin1(length) {
    const bytes = new Uint8Array(this.buffer, this.offset, length);
    this.offset += length;
    const decoder = new TextDecoder("latin1");
    return decoder.decode(bytes);
  }

  /**
   * Return a copy of the data from the current offset to the end of the buffer.
   *
   * @returns {ArrayBuffer}
   */
  sliceRemaining() {
    return this.buffer.slice(this.offset);
  }
}

/**
 * Parse an ArrayBuffer of a .npy file into a typed array and shape.
 *
 * Only format version 1.x, C-ordered (non-Fortran) arrays, and the dtypes
 * f8, f4, f2, i4, i2, i1, u4, u2 and u1 are supported. The returned typed array
 * is backed by a copy of the data, so the input buffer is never modified.
 *
 * https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html
 *
 * @param {ArrayBuffer} buffer The ArrayBuffer containing the .npy data.
 * @returns {{data: TypedArray, shape: number[], dtype: string}}
 * @throws {Error} When the buffer is not a supported, well-formed .npy file.
 */
export function parseNpy(buffer) {
  // 6 bytes of magic + 2 bytes of version + 2 bytes of header length.
  const preambleLength = 10;
  if (buffer.byteLength < preambleLength) {
    // Without this the reads below fail with an opaque RangeError.
    throw new Error("Not a valid .npy file");
  }

  const reader = new ByteReader(buffer);
  if (reader.uint8() !== 0x93 || reader.latin1(5) !== "NUMPY") {
    throw new Error("Not a valid .npy file");
  }
  const majorVersion = reader.uint8();
  reader.uint8(); // minorVersion

  if (majorVersion !== 1) {
    throw new Error("Only major version 1 is currently supported.");
  }

  const headerLength = reader.uint16("little");
  if (headerLength > buffer.byteLength - reader.offset) {
    // The declared header runs past the end of the buffer.
    throw new Error("Not a valid .npy file");
  }
  let headerText = reader.latin1(headerLength).trim();

  // Header is a Python dict string. Do some text manipulation to make it JSON parseable.
  //
  // "{'descr': '<f8', 'fortran_order': False, 'shape': (3, 4), }"
  // "{'descr': '|u1', 'fortran_order': False, 'shape': (63091, 128), }"
  headerText = headerText
    .replace(/'/g, '"') // single to double quotes
    .replace("False", "false")
    .replace("True", "true")
    .replace(/,\s*}/, "}") // trailing commas
    .replace(/,\s*\)/, ")"); // trailing commas in tuple

  const header = JSON.parse(
    headerText.replace(/\((.*?)\)/, (m, inner) => {
      // convert shape tuple into JSON array
      return `[${inner.trim().replace(/, /g, ",")}]`;
    })
  );

  if (header.fortran_order) {
    throw new Error("Unable to parse an array using fortran_order");
  }

  const fullType = header.descr; // e.g. '<f8'
  if (typeof fullType !== "string") {
    // Structured dtypes are described by a list rather than a string.
    throw new Error(`Unsupported dtype: ${JSON.stringify(fullType)}`);
  }
  const byteOrder = fullType[0];
  const dtype = fullType.slice(1);

  let TypedArrayConstructor;
  // A switch (rather than a lookup table) keeps each constructor reference lazy.
  switch (dtype) {
    case "f8": // float64
      TypedArrayConstructor = Float64Array;
      break;
    case "f4": // float32
      TypedArrayConstructor = Float32Array;
      break;
    case "f2": // float16
      TypedArrayConstructor = Float16Array;
      break;
    case "i4": // int32
      TypedArrayConstructor = Int32Array;
      break;
    case "i2": // int16
      TypedArrayConstructor = Int16Array;
      break;
    case "i1": // int8
      TypedArrayConstructor = Int8Array;
      break;
    case "u4": // uint32
      TypedArrayConstructor = Uint32Array;
      break;
    case "u2": // uint16
      TypedArrayConstructor = Uint16Array;
      break;
    case "u1": // uint8
      TypedArrayConstructor = Uint8Array;
      break;
    default:
      throw new Error(`Unsupported dtype: ${fullType}`);
  }

  const bytesPerElement = TypedArrayConstructor.BYTES_PER_ELEMENT;
  const dataBuffer = reader.sliceRemaining();
  if (dataBuffer.byteLength % bytesPerElement !== 0) {
    // The typed array constructor would otherwise throw a bare RangeError.
    throw new Error(
      `The data size (${dataBuffer.byteLength} bytes) is not a multiple of the element size (${bytesPerElement} bytes) for dtype ${fullType}`
    );
  }
  const typedArray = new TypedArrayConstructor(dataBuffer);

  const shape = header.shape;
  const isValidShape =
    Array.isArray(shape) &&
    shape.every(size => Number.isInteger(size) && size >= 0);
  if (!isValidShape) {
    throw new Error(`Invalid shape in .npy header: ${JSON.stringify(shape)}`);
  }

  const expectedLength = shape.reduce((total, size) => total * size, 1);
  if (typedArray.length !== expectedLength) {
    throw new Error(
      `The data length (${typedArray.length}) did not match the expected dimensions (${expectedLength}) for shape ${JSON.stringify(shape)}`
    );
  }

  // Typed arrays interpret their bytes in the host's byte order, so a swap is
  // needed exactly when the file's explicit byte order differs from the host's.
  // "|" (not applicable) and "=" (native) never need one.
  const hostIsLittleEndian =
    new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;
  const needsByteSwap =
    (byteOrder === ">" && hostIsLittleEndian) ||
    (byteOrder === "<" && !hostIsLittleEndian);

  if (needsByteSwap && bytesPerElement > 1) {
    const bytes = new Uint8Array(typedArray.buffer);
    for (let i = 0; i < bytes.length; i += bytesPerElement) {
      // subarray() is a view, so reversing it swaps the element's bytes in place.
      bytes.subarray(i, i + bytesPerElement).reverse();
    }
  }

  return { data: typedArray, shape, dtype };
}

// diff --git a/toolkit/components/ml/tests/browser/browser_ml_utils.js b/toolkit/components/ml/tests/browser/browser_ml_utils.js
// @@ -15,6 +15,7 @@ const {
//    addonIdToEngineId,
//    engineIdToAddonId,
//    stringifyForLog,
// +  parseNpy,
//  } = ChromeUtils.importESModule("chrome://global/content/ml/Utils.sys.mjs");
//
//  /**
// @@ -1330,3 +1331,62 @@ add_task(function test_stringifyForLog_top_level_bigint() {
//      "Top-level BigInt should stringify (quoted or not)"
//    );
//  });

/**
 * Check that .npy files produced by numpy itself parse into the expected data,
 * shape and dtype for a few element types.
 */
add_task(function test_npy_parsing() {
  // # Generate some npy arrays with python:
  // import numpy as np
  // import io
  // fib5 = [0, 1, 1, 2, 3]
  // fib5_u8 = np.array(fib5, dtype=np.uint8)
  // fib5_f16 = np.array(fib5, dtype=np.float16)
  // fib5_f32 = np.array(fib5, dtype=np.float32)
  // def to_npy_uint8array(arr: np.ndarray) -> str:
  //   buf = io.BytesIO()
  //   np.save(buf, arr)
  //   b = buf.getvalue()
  //   return "new Uint8Array([" + ", ".join(str(x) for x in b) + "])"
  // npy_u8_5 = to_npy_uint8array(fib5_u8)
  // npy_f16_5 = to_npy_uint8array(fib5_f16)
  // npy_f32_5 = to_npy_uint8array(fib5_f32)
  // npy_u8_5, npy_f16_5, npy_f32_5

  // Each fixture is: magic + version + header length (118), the 57 byte header
  // dict, 60 spaces of padding, a newline, and finally the raw array data.

  // prettier-ignore
  const u8 = new Uint8Array([
    147, 78, 85, 77, 80, 89, 1, 0, 118, 0,
    123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 124, 117, 49, 39,
    44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32,
    70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    10,
    0, 1, 1, 2, 3,
  ]);
  // prettier-ignore
  const fp16 = new Uint8Array([
    147, 78, 85, 77, 80, 89, 1, 0, 118, 0,
    123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 60, 102, 50, 39,
    44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32,
    70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    10,
    0, 0, 0, 60, 0, 60, 0, 64, 0, 66,
  ]);
  // prettier-ignore
  const fp32 = new Uint8Array([
    147, 78, 85, 77, 80, 89, 1, 0, 118, 0,
    123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 60, 102, 52, 39,
    44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32,
    70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    10,
    0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64,
  ]);

  // The values every fixture above encodes (fib5 in the python snippet).
  const fib5 = [0, 1, 1, 2, 3];

  const testCases = [
    { name: "u8", npy: u8, expectedDtype: "u1" },
    { name: "fp16", npy: fp16, expectedDtype: "f2" },
    { name: "fp32", npy: fp32, expectedDtype: "f4" },
  ];

  for (const { name, npy, expectedDtype } of testCases) {
    const { data, shape, dtype } = parseNpy(npy.buffer);
    SimpleTest.isDeeply(data, fib5, `${name} encoding matches`);
    SimpleTest.isDeeply(shape, [5], `${name} shape matches`);
    is(dtype, expectedDtype, `${name} dtype matches`);
  }
});

/**
 * Check that round tripping works for the numpy parsing, and the generation from
 * test fixtures.
 */
add_task(function test_npy_fixture() {
  const vocabSize = 3;
  const dimensions = 4;
  const { numbers, encoding } = generateFloat16Numpy(vocabSize, dimensions);
  const { data, shape, dtype } = parseNpy(encoding.buffer);
  SimpleTest.isDeeply(numbers, data, "Round tripping produces the same array");
  SimpleTest.isDeeply(shape, [vocabSize, dimensions], "The shape is preserved");
  is(dtype, "f2", "The datatype is correctly fp16");
});

// diff --git a/toolkit/components/ml/tests/browser/head.js b/toolkit/components/ml/tests/browser/head.js
// @@ -800,3 +800,57 @@ function startMockOpenAI({ echo = "This gets echoed." } = {}) {
//  function stopMockOpenAI(server) {
//    return new Promise(resolve => server.stop(resolve));
//  }

/**
 * Generates a numpy encoded float16 array to be used for generating static embeddings
 * test data.
 *
 * The result is a version 1.0 .npy file with a `<f2` dtype and a shape of
 * (vocabSize, dimensions).
 *
 * @param {number} vocabSize
 * @param {number} dimensions
 * @returns {{ numbers: Float16Array, encoding: Uint8Array }}
 *   `numbers` holds the raw values, and `encoding` the bytes of the .npy file.
 */
function generateFloat16Numpy(vocabSize, dimensions) {
  const numbers = new Float16Array(vocabSize * dimensions);
  // Build the data, stored row by row:
  // [0, 0.1, 0.2, ..., 0.1 * (vocabSize * dimensions - 1)]
  for (let index = 0; index < numbers.length; index++) {
    numbers[index] = index / 10;
  }

  const magic = new Uint8Array([0x93, 78, 85, 77, 80, 89]); // \x93NUMPY
  const version = new Uint8Array([1, 0]); // Version 1.0
  let header = `{'descr': '<f2', 'fortran_order': False, 'shape': (${vocabSize},${dimensions}), }`;

  // Pad the header with spaces so that the preamble, header, and its trailing
  // newline together end on a 16-byte boundary.
  const preLength = magic.length + version.length + 2; // +2 for header length field
  const padding = (16 - ((preLength + header.length + 1) % 16)) % 16;
  header += " ".repeat(padding) + "\n";

  const headerBytes = new TextEncoder().encode(header);

  const encoding = new Uint8Array(
    preLength + headerBytes.length + numbers.byteLength
  );
  const view = new DataView(encoding.buffer);

  // Write everything out.
  let offset = 0;
  encoding.set(magic, offset);
  offset += magic.length;
  encoding.set(version, offset);
  offset += version.length;
  view.setUint16(offset, headerBytes.length, true);
  offset += 2;
  encoding.set(headerBytes, offset);
  offset += headerBytes.length;
  // The header declares "<f2", so write each value explicitly as little endian
  // rather than copying the typed array's bytes in the host's byte order.
  for (const value of numbers) {
    view.setFloat16(offset, value, true);
    offset += Float16Array.BYTES_PER_ELEMENT;
  }

  return { numbers, encoding };
}