[ tor-browser ].git.dasho

commit a49b09a1fbd3e7495c953880b792167f89c3a5a7
parent 63a518731e56d809573217016ed0e0fc8a2f9732
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Thu,  2 Oct 2025 19:35:07 +0000

Bug 1990084 - Add some numpy npy parsing utilities; r=tarek

Differential Revision: https://phabricator.services.mozilla.com/D266426

Diffstat:
M toolkit/components/ml/content/Utils.sys.mjs  | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/ml/tests/browser/browser_ml_utils.js  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M toolkit/components/ml/tests/browser/head.js  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 268 insertions(+), 0 deletions(-)
diff --git a/toolkit/components/ml/content/Utils.sys.mjs b/toolkit/components/ml/content/Utils.sys.mjs
@@ -1462,3 +1462,157 @@ export function stringifyForLog(
   }
   return s;
 }
+
+/**
+ * A forward-only cursor over an ArrayBuffer. Every read starts at `offset` and
+ * moves `offset` past the bytes it consumed.
+ */
+class ByteReader {
+  /**
+   * @param {ArrayBuffer} buffer The bytes to read, starting at offset 0.
+   */
+  constructor(buffer) {
+    this.buffer = buffer;
+    this.view = new DataView(buffer);
+    this.offset = 0;
+  }
+
+  /**
+   * Read a single unsigned byte and advance by one.
+   *
+   * @returns {number}
+   */
+  uint8() {
+    // The cursor is advanced before the read, so it moves even if the read throws.
+    const position = this.offset;
+    this.offset = position + 1;
+    return this.view.getUint8(position);
+  }
+
+  /**
+   * Read an unsigned 16-bit integer and advance by two.
+   *
+   * @param {"little" | "big"} endianess Byte order of the stored value.
+   * @returns {number}
+   */
+  uint16(endianess) {
+    const isLittleEndian = endianess == "little";
+    const result = this.view.getUint16(this.offset, isLittleEndian);
+    this.offset += 2;
+    return result;
+  }
+
+  /**
+   * Decode the next `length` bytes with a "latin1" TextDecoder and advance past them.
+   *
+   * @param {number} length Number of bytes to consume.
+   * @returns {string}
+   */
+  latin1(length) {
+    const start = this.offset;
+    const bytes = new Uint8Array(this.buffer, start, length);
+    this.offset = start + length;
+    return new TextDecoder("latin1").decode(bytes);
+  }
+
+  /**
+   * Copy everything from the current offset to the end of the buffer. The offset
+   * is left untouched.
+   *
+   * @returns {ArrayBuffer}
+   */
+  sliceRemaining() {
+    const { buffer, offset } = this;
+    return buffer.slice(offset);
+  }
+}
+
+/**
+ * Parse an ArrayBuffer of a .npy file into a typed array and shape.
+ *
+ * Only format version 1.x files that hold a C-ordered array of one of the dtypes
+ * f8, f4, f2, i4, i2, i1, u4, u2 or u1 are accepted. The returned data is a copy
+ * of the payload, converted to the host byte order when necessary.
+ *
+ * https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html
+ *
+ * @param {ArrayBuffer} buffer The ArrayBuffer containing the .npy data.
+ * @returns {{data: TypedArray, shape: number[], dtype: string}}
+ *   `dtype` is the header's `descr` without its byte-order prefix, e.g. "f4".
+ * @throws {Error} When the magic bytes or version are wrong, the array uses
+ *   fortran_order, the dtype or shape is unsupported, or the payload size does
+ *   not match the shape.
+ */
+export function parseNpy(buffer) {
+  const reader = new ByteReader(buffer);
+  if (reader.uint8() != 0x93 || reader.latin1(5) != "NUMPY") {
+    throw new Error("Not a valid .npy file");
+  }
+  const majorVersion = reader.uint8();
+  reader.uint8(); // minorVersion
+
+  if (majorVersion != 1) {
+    throw new Error("Only major version 1 is currently supported.");
+  }
+
+  const headerLength = reader.uint16("little");
+
+  // Header is a Python dict string. Do some text manipulation to make it JSON parseable.
+  //
+  //  "{'descr': '<f8', 'fortran_order': False, 'shape': (3, 4), }"
+  //  "{'descr': '|u1', 'fortran_order': False, 'shape': (63091, 128), }"
+  const headerText = reader
+    .latin1(headerLength)
+    .trim()
+    .replace(/'/g, '"') // single to double quotes
+    .replace("False", "false")
+    .replace("True", "true")
+    .replace(/,\s*}/, "}") // trailing commas
+    .replace(/,\s*\)/, ")") // trailing commas in tuple
+    .replace(/\((.*?)\)/, (m, inner) => {
+      // convert shape tuple into JSON array
+      return `[${inner.trim().replace(/, /g, ",")}]`;
+    });
+  const header = JSON.parse(headerText);
+
+  if (header.fortran_order) {
+    throw new Error("Unable to parse an array using fortran_order");
+  }
+
+  const fullType = header.descr; // e.g. '<f8'
+  if (typeof fullType != "string") {
+    throw new Error(`Unsupported dtype: ${JSON.stringify(fullType)}`);
+  }
+
+  // The first character is the byte order: "<" little-endian, ">" big-endian,
+  // "=" native, and "|" not applicable (single byte types).
+  const byteOrder = fullType[0];
+  if (!["<", ">", "=", "|"].includes(byteOrder)) {
+    throw new Error(`Unsupported dtype: ${fullType}`);
+  }
+  const dtype = fullType.slice(1);
+
+  let TypedArrayClass;
+  switch (dtype) {
+    case "f8": // float64
+      TypedArrayClass = Float64Array;
+      break;
+    case "f4": // float32
+      TypedArrayClass = Float32Array;
+      break;
+    case "f2": // float16
+      TypedArrayClass = Float16Array;
+      break;
+    case "i4": // int32
+      TypedArrayClass = Int32Array;
+      break;
+    case "i2": // int16
+      TypedArrayClass = Int16Array;
+      break;
+    case "i1": // int8
+      TypedArrayClass = Int8Array;
+      break;
+    case "u4": // uint32
+      TypedArrayClass = Uint32Array;
+      break;
+    case "u2": // uint16
+      TypedArrayClass = Uint16Array;
+      break;
+    case "u1": // uint8
+      TypedArrayClass = Uint8Array;
+      break;
+    default:
+      throw new Error(`Unsupported dtype: ${fullType}`);
+  }
+
+  const shape = header.shape;
+  if (
+    !Array.isArray(shape) ||
+    !shape.every(size => Number.isInteger(size) && size >= 0)
+  ) {
+    throw new Error(
+      `Invalid shape in the .npy header: ${JSON.stringify(shape)}`
+    );
+  }
+  let expectedLength = 1;
+  for (const size of shape) {
+    expectedLength *= size;
+  }
+
+  // Validate the payload size before building the typed array. A payload that is
+  // not a whole number of elements would otherwise fail in the constructor with a
+  // RangeError that says nothing about the file.
+  const dataBuffer = reader.sliceRemaining();
+  const bytesPerElement = TypedArrayClass.BYTES_PER_ELEMENT;
+  const dataLength = dataBuffer.byteLength / bytesPerElement;
+  if (dataLength != expectedLength) {
+    throw new Error(
+      `The data length (${dataLength}) did not match the expected dimensions (${expectedLength}) for shape ${JSON.stringify(shape)}`
+    );
+  }
+  const typedArray = new TypedArrayClass(dataBuffer);
+
+  // Typed arrays use the host byte order, so only swap when the file's explicit
+  // byte order differs from the host's. "=" and "|" never need a swap.
+  const isHostLittleEndian =
+    new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;
+  const needsByteSwap =
+    (byteOrder === ">" && isHostLittleEndian) ||
+    (byteOrder === "<" && !isHostLittleEndian);
+  if (needsByteSwap && bytesPerElement > 1) {
+    // dataBuffer is a private copy, so reversing each element in place is safe.
+    const bytes = new Uint8Array(dataBuffer);
+    for (let i = 0; i < bytes.length; i += bytesPerElement) {
+      bytes.subarray(i, i + bytesPerElement).reverse();
+    }
+  }
+
+  return { data: typedArray, shape, dtype };
+}
diff --git a/toolkit/components/ml/tests/browser/browser_ml_utils.js b/toolkit/components/ml/tests/browser/browser_ml_utils.js
@@ -15,6 +15,7 @@ const {
   addonIdToEngineId,
   engineIdToAddonId,
   stringifyForLog,
+  parseNpy,
 } = ChromeUtils.importESModule("chrome://global/content/ml/Utils.sys.mjs");
 
 /**
@@ -1330,3 +1331,62 @@ add_task(function test_stringifyForLog_top_level_bigint() {
     "Top-level BigInt should stringify (quoted or not)"
   );
 });
+
+/**
+ * Check that parseNpy decodes .npy files written by numpy itself for a few dtypes.
+ */
+add_task(function test_npy_parsing() {
+  // # Generate some npy arrays with python:
+  // import numpy as np
+  // import io
+  // fib5 = [0, 1, 1, 2, 3]
+  // fib5_u8 = np.array(fib5, dtype=np.uint8)
+  // fib5_f16 = np.array(fib5, dtype=np.float16)
+  // fib5_f32 = np.array(fib5, dtype=np.float32)
+  // def to_npy_uint8array(arr: np.ndarray) -> str:
+  //     buf = io.BytesIO()
+  //     np.save(buf, arr)
+  //     b = buf.getvalue()
+  //     return "new Uint8Array([" + ", ".join(str(x) for x in b) + "])"
+  // npy_u8_5 = to_npy_uint8array(fib5_u8)
+  // npy_f16_5 = to_npy_uint8array(fib5_f16)
+  // npy_f32_5 = to_npy_uint8array(fib5_f32)
+  // npy_u8_5, npy_f16_5, npy_f32_5
+
+  // prettier-ignore
+  const u8 = new Uint8Array([147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 124, 117, 49, 39, 44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32, 70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 10, 0, 1, 1, 2, 3])
+  // prettier-ignore
+  const fp16 = new Uint8Array([147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 60, 102, 50, 39, 44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32, 70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 10, 0, 0, 0, 60, 0, 60, 0, 64, 0, 66])
+  // prettier-ignore
+  const fp32 = new Uint8Array([147, 78, 85, 77, 80, 89, 1, 0, 118, 0, 123, 39, 100, 101, 115, 99, 114, 39, 58, 32, 39, 60, 102, 52, 39, 44, 32, 39, 102, 111, 114, 116, 114, 97, 110, 95, 111, 114, 100, 101, 114, 39, 58, 32, 70, 97, 108, 115, 101, 44, 32, 39, 115, 104, 97, 112, 101, 39, 58, 32, 40, 53, 44, 41, 44, 32, 125, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 10, 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64])
+
+  // The values every fixture above encodes (`fib5` in the python snippet).
+  const fib5 = [0, 1, 1, 2, 3];
+
+  const testCases = [
+    { name: "u8", npy: u8, expectedDtype: "u1" },
+    { name: "fp16", npy: fp16, expectedDtype: "f2" },
+    { name: "fp32", npy: fp32, expectedDtype: "f4" },
+  ];
+
+  for (const { name, npy, expectedDtype } of testCases) {
+    const { data, shape, dtype } = parseNpy(npy.buffer);
+    SimpleTest.isDeeply(data, fib5, `${name} encoding matches`);
+    SimpleTest.isDeeply(shape, [5], `${name} shape matches`);
+    // dtype is a plain string, so a strict comparison with its own message is
+    // enough and keeps failures distinguishable from the shape check above.
+    is(dtype, expectedDtype, `${name} dtype matches`);
+  }
+});
+
+/**
+ * Check that round tripping works for the numpy parsing, and the generation from
+ * test fixtures: what generateFloat16Numpy encodes, parseNpy must decode unchanged.
+ */
+add_task(function test_npy_fixture() {
+  const vocabSize = 3;
+  const dimensions = 4;
+  const { numbers, encoding } = generateFloat16Numpy(vocabSize, dimensions);
+  const { data, shape, dtype } = parseNpy(encoding.buffer);
+  // Parsed value first, expected value second, so failure output labels them correctly.
+  SimpleTest.isDeeply(data, numbers, "Round tripping produces the same array");
+  SimpleTest.isDeeply(shape, [vocabSize, dimensions], "The shape is preserved");
+  is(dtype, "f2", "The datatype is correctly fp16");
+});
diff --git a/toolkit/components/ml/tests/browser/head.js b/toolkit/components/ml/tests/browser/head.js
@@ -800,3 +800,57 @@ function startMockOpenAI({ echo = "This gets echoed." } = {}) {
 function stopMockOpenAI(server) {
   return new Promise(resolve => server.stop(resolve));
 }
+
+/**
+ * Builds a version 1.0 .npy file in memory that holds a little-endian float16
+ * matrix, to be used for generating static embeddings test data.
+ *
+ * @param {number} vocabSize Number of rows written into the shape tuple.
+ * @param {number} dimensions Number of columns written into the shape tuple.
+ * @returns {{ numbers: Float16Array, encoding: Uint8Array }}
+ *   `numbers` is the raw matrix data and `encoding` is the complete .npy file.
+ */
+function generateFloat16Numpy(vocabSize, dimensions) {
+  // The element at flat index i holds i / 10 (stored as float16):
+  // [0, 0.1, 0.2, ..., (vocabSize * dimensions - 1) / 10]
+  const numbers = new Float16Array(vocabSize * dimensions);
+  for (let index = 0; index < numbers.length; index++) {
+    numbers[index] = index / 10;
+  }
+
+  // \x93NUMPY followed by version 1.0.
+  const magicAndVersion = Uint8Array.of(0x93, 78, 85, 77, 80, 89, 1, 0);
+  // The 2 byte header length field sits right after the magic and version.
+  const preambleLength = magicAndVersion.length + 2;
+
+  const dictText = `{'descr': '<f2', 'fortran_order': False, 'shape': (${vocabSize},${dimensions}), }`;
+
+  // Pad with spaces so that preamble + header (including the final newline) ends
+  // on a 16-byte boundary.
+  const unpaddedLength = preambleLength + dictText.length + 1;
+  const padding = (16 - (unpaddedLength % 16)) % 16;
+  const headerBytes = new TextEncoder().encode(
+    dictText + " ".repeat(padding) + "\n"
+  );
+
+  // Lay out: magic + version, little-endian header length, header, raw data.
+  const encoding = new Uint8Array(
+    preambleLength + headerBytes.length + numbers.byteLength
+  );
+  encoding.set(magicAndVersion, 0);
+  new DataView(encoding.buffer).setUint16(
+    magicAndVersion.length,
+    headerBytes.length,
+    true
+  );
+  encoding.set(headerBytes, preambleLength);
+  encoding.set(
+    new Uint8Array(numbers.buffer),
+    preambleLength + headerBytes.length
+  );
+
+  return { numbers, encoding };
+}

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	toolkit/components/ml/content/Utils.sys.mjs	\|	154	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	toolkit/components/ml/tests/browser/browser_ml_utils.js	\|	60	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	toolkit/components/ml/tests/browser/head.js	\|	54	++++++++++++++++++++++++++++++++++++++++++++++++++++++