tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

test_utf.js (6229B)


      1 // NOTE: Requires testharness.js
      2 // http://www.w3.org/2008/webapps/wiki/Harness
      3 
      4 // Extension to testharness.js API which avoids logging enormous strings
      5 // on a coding failure.
      6 function assert_string_equals(actual, expected, description) {
      7  // short circuit success case
      8  if (actual === expected) {
      9    assert_true(true, description + ": <actual> === <expected>");
     10    return;
     11  }
     12 
     13  // length check
     14  assert_equals(
     15    actual.length,
     16    expected.length,
     17    description + ": string lengths"
     18  );
     19 
     20  var i, a, b;
     21  for (i = 0; i < actual.length; i++) {
     22    a = actual.charCodeAt(i);
     23    b = expected.charCodeAt(i);
     24    if (a !== b) {
     25      assert_true(
     26        false,
     27        description +
     28          ": code unit " +
     29          i.toString() +
     30          " unequal: " +
     31          cpname(a) +
     32          " != " +
     33          cpname(b)
     34      );
     35    } // doesn't return
     36  }
     37 
     38  // It should be impossible to get here, because the initial
     39  // comparison failed, so either the length comparison or the
     40  // codeunit-by-codeunit comparison should also fail.
     41  assert_true(false, description + ": failed to detect string difference");
     42 }
     43 
     44 // Inspired by:
     45 // http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
     46 function encode_utf8(string) {
     47  var utf8 = unescape(encodeURIComponent(string));
     48  var octets = new Uint8Array(utf8.length),
     49    i;
     50  for (i = 0; i < utf8.length; i += 1) {
     51    octets[i] = utf8.charCodeAt(i);
     52  }
     53  return octets;
     54 }
     55 
     56 function encode_utf16le(string) {
     57  var octets = new Uint8Array(string.length * 2);
     58  var di = 0;
     59  for (var i = 0; i < string.length; i++) {
     60    var code = string.charCodeAt(i);
     61    octets[di++] = code & 0xff;
     62    octets[di++] = code >> 8;
     63  }
     64  return octets;
     65 }
     66 
     67 function encode_utf16be(string) {
     68  var octets = new Uint8Array(string.length * 2);
     69  var di = 0;
     70  for (var i = 0; i < string.length; i++) {
     71    var code = string.charCodeAt(i);
     72    octets[di++] = code >> 8;
     73    octets[di++] = code & 0xff;
     74  }
     75  return octets;
     76 }
     77 
     78 function decode_utf8(octets) {
     79  var utf8 = String.fromCharCode.apply(null, octets);
     80  return decodeURIComponent(escape(utf8));
     81 }
     82 
     83 // Helpers for test_utf_roundtrip.
     84 function cpname(n) {
     85  if (n + 0 !== n) {
     86    return n.toString();
     87  }
     88  var w = n <= 0xffff ? 4 : 6;
     89  return "U+" + ("000000" + n.toString(16).toUpperCase()).slice(-w);
     90 }
     91 
     92 function genblock(from, len) {
     93  var i, j, point, offset;
     94  var size, block;
     95 
     96  // determine size required:
     97  //    1 unit   for each point from U+000000 through U+00D7FF
     98  //    0 units                      U+00D800 through U+00DFFF
     99  //    1 unit                       U+00E000 through U+00FFFF
    100  //    2 units                      U+010000 through U+10FFFF
    101  function overlap(min1, max1, min2, max2) {
    102    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
    103  }
    104  size =
    105    overlap(from, from + len, 0x000000, 0x00d800) +
    106    overlap(from, from + len, 0x00e000, 0x010000) +
    107    overlap(from, from + len, 0x010000, 0x110000) * 2;
    108 
    109  block = new Uint16Array(size);
    110  for (i = 0, j = 0; i < len; i++) {
    111    point = from + i;
    112    if (0xd800 <= point && point <= 0xdfff) {
    113      continue;
    114    } else if (point <= 0xffff) {
    115      block[j++] = point;
    116    } else {
    117      offset = point - 0x10000;
    118      block[j++] = 0xd800 + (offset >> 10);
    119      block[j++] = 0xdc00 + (offset & 0x3ff);
    120    }
    121  }
    122  return String.fromCharCode.apply(null, block);
    123 }
    124 
    125 function test_utf_roundtrip() {
    126  var MIN_CODEPOINT = 0;
    127  var MAX_CODEPOINT = 0x10ffff;
    128  var BLOCK_SIZE = 0x1000;
    129 
    130  var block, block_tag, i, encoded, decoded, exp_encoded, exp_decoded;
    131 
    132  var TD_U16LE = new TextDecoder("UTF-16LE");
    133 
    134  var TD_U16BE = new TextDecoder("UTF-16BE");
    135 
    136  var TE_U8 = new TextEncoder();
    137  var TD_U8 = new TextDecoder("UTF-8");
    138 
    139  for (i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) {
    140    block_tag = cpname(i) + " - " + cpname(i + BLOCK_SIZE - 1);
    141    block = genblock(i, BLOCK_SIZE);
    142 
    143    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
    144    encoded = encode_utf16le(block);
    145    decoded = TD_U16LE.decode(encoded);
    146    assert_string_equals(block, decoded, "UTF-16LE round trip " + block_tag);
    147 
    148    encoded = encode_utf16be(block);
    149    decoded = TD_U16BE.decode(encoded);
    150    assert_string_equals(block, decoded, "UTF-16BE round trip " + block_tag);
    151 
    152    encoded = TE_U8.encode(block);
    153    decoded = TD_U8.decode(encoded);
    154    assert_string_equals(block, decoded, "UTF-8 round trip " + block_tag);
    155 
    156    // test TextEncoder(UTF-8) against the older idiom
    157    exp_encoded = encode_utf8(block);
    158    assert_array_equals(
    159      encoded,
    160      exp_encoded,
    161      "UTF-8 reference encoding " + block_tag
    162    );
    163 
    164    exp_decoded = decode_utf8(exp_encoded);
    165    assert_string_equals(
    166      decoded,
    167      exp_decoded,
    168      "UTF-8 reference decoding " + block_tag
    169    );
    170  }
    171 }
    172 
    173 function test_utf_samples() {
    174  // z, cent, CJK water, G-Clef, Private-use character
    175  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
    176  var cases = [
    177    {
    178      encoding: "utf-8",
    179      expected: [
    180        0x7a, 0xc2, 0xa2, 0xe6, 0xb0, 0xb4, 0xf0, 0x9d, 0x84, 0x9e, 0xf4, 0x8f,
    181        0xbf, 0xbd,
    182      ],
    183    },
    184    {
    185      encoding: "utf-16le",
    186      expected: [
    187        0x7a, 0x00, 0xa2, 0x00, 0x34, 0x6c, 0x34, 0xd8, 0x1e, 0xdd, 0xff, 0xdb,
    188        0xfd, 0xdf,
    189      ],
    190    },
    191    {
    192      encoding: "utf-16",
    193      expected: [
    194        0x7a, 0x00, 0xa2, 0x00, 0x34, 0x6c, 0x34, 0xd8, 0x1e, 0xdd, 0xff, 0xdb,
    195        0xfd, 0xdf,
    196      ],
    197    },
    198    {
    199      encoding: "utf-16be",
    200      expected: [
    201        0x00, 0x7a, 0x00, 0xa2, 0x6c, 0x34, 0xd8, 0x34, 0xdd, 0x1e, 0xdb, 0xff,
    202        0xdf, 0xfd,
    203      ],
    204    },
    205  ];
    206 
    207  var encoded = new TextEncoder().encode(sample);
    208  assert_array_equals(encoded, cases[0].expected, "expected equal encodings");
    209 
    210  cases.forEach(function (t) {
    211    var decoded = new TextDecoder(t.encoding).decode(
    212      new Uint8Array(t.expected)
    213    );
    214    assert_equals(decoded, sample, "expected equal decodings - " + t.encoding);
    215  });
    216 }
    217 
    218 test(
    219  test_utf_samples,
    220  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample"
    221 );
    222 
    223 test(
    224  test_utf_roundtrip,
    225  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and " +
    226    "agreement with encode/decodeURIComponent"
    227 );