tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

test_charset_conversion.js (9232B)


// Local alias for Cr.NS_ERROR_ILLEGAL_VALUE; the UTF8 and UTF16 readers below
// throw this value when they encounter malformed input.
const NS_ERROR_ILLEGAL_VALUE = Cr.NS_ERROR_ILLEGAL_VALUE;

// Constructor shortcuts created with Components.Constructor; they are left
// undefined here and assigned in run_test().
var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;

// Result of do_get_file("data/"); assigned in run_test() and cloned by the
// file-opening helpers below.
var dataDir;
      6 
      7 function run_test() {
      8  BIS = Components.Constructor(
      9    "@mozilla.org/binaryinputstream;1",
     10    "nsIBinaryInputStream",
     11    "setInputStream"
     12  );
     13  BOS = Components.Constructor(
     14    "@mozilla.org/binaryoutputstream;1",
     15    "nsIBinaryOutputStream",
     16    "setOutputStream"
     17  );
     18  _Pipe = Components.Constructor("@mozilla.org/pipe;1", "nsIPipe", "init");
     19  COS = Components.Constructor(
     20    "@mozilla.org/intl/converter-output-stream;1",
     21    "nsIConverterOutputStream",
     22    "init"
     23  );
     24  FIS = Components.Constructor(
     25    "@mozilla.org/network/file-input-stream;1",
     26    "nsIFileInputStream",
     27    "init"
     28  );
     29  _SS = Components.Constructor(
     30    "@mozilla.org/storagestream;1",
     31    "nsIStorageStream",
     32    "init"
     33  );
     34  CIS = Components.Constructor(
     35    "@mozilla.org/intl/converter-input-stream;1",
     36    "nsIConverterInputStream",
     37    "init"
     38  );
     39 
     40  dataDir = do_get_file("data/");
     41 
     42  test_utf8_1();
     43  test_cross_conversion();
     44 }
     45 
// Sample strings used by test_utf8_1: each one is written through a UTF-8
// converter output stream and the resulting bytes are decoded again and
// compared against the string's own character codes.
const UNICODE_STRINGS = [
  "\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE",

  // One sample per UTF-8 encoded length (1, 2 and 3 bytes).
  "AZaz09 \u007F " + // U+000000 to U+00007F
    "\u0080 \u0398 \u03BB \u0725 " + // U+000080 to U+0007FF
    "\u0964 \u0F5F \u20AC \uFFFB", // U+000800 to U+00FFFF

  // there would be strings containing non-BMP code points here, but
  // unfortunately JS strings are UCS-2 (and worse yet are treated as
  // 16-bit values by the spec), so we have to do gymnastics to work
  // with non-BMP -- manual surrogate decoding doesn't work because
  // String.prototype.charCodeAt() ignores surrogate pairs and only
  // returns 16-bit values
];
     60 
// Test conversion equality. Keys are the names of data files (resolved
// relative to dataDir) that all contain equivalent Unicode data; values are
// the encoding of each file, in the format expected by
// nsIConverter(In|Out)putStream.init. test_cross_conversion decodes every
// file and compares it against every file in this table, itself included.
const UNICODE_FILES = {
  "unicode-conversion.utf8.txt": "UTF-8",
  "unicode-conversion.utf16.txt": "UTF-16",
  "unicode-conversion.utf16le.txt": "UTF-16LE",
  "unicode-conversion.utf16be.txt": "UTF-16BE",
};
     70 
     71 function test_utf8_1() {
     72  for (var i = 0; i < UNICODE_STRINGS.length; i++) {
     73    var pipe = Pipe();
     74    var conv = new COS(pipe.outputStream, "UTF-8");
     75    Assert.ok(conv.writeString(UNICODE_STRINGS[i]));
     76    conv.close();
     77 
     78    if (
     79      !equalStreams(
     80        new UTF8(pipe.inputStream),
     81        stringToCodePoints(UNICODE_STRINGS[i])
     82      )
     83    ) {
     84      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
     85    }
     86  }
     87 }
     88 
     89 function test_cross_conversion() {
     90  for (var fn1 in UNICODE_FILES) {
     91    var fin = getBinaryInputStream(fn1);
     92    var ss = StorageStream();
     93 
     94    var bos = new BOS(ss.getOutputStream(0));
     95    var av;
     96    while ((av = fin.available()) > 0) {
     97      var data = fin.readByteArray(av);
     98      bos.writeByteArray(data);
     99    }
    100    fin.close();
    101    bos.close();
    102 
    103    for (var fn2 in UNICODE_FILES) {
    104      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
    105      var unichar = new CIS(
    106        ss.newInputStream(0),
    107        UNICODE_FILES[fn1],
    108        8192,
    109        0x0
    110      );
    111 
    112      if (!equalUnicharStreams(unichar, fin2)) {
    113        do_throw(
    114          "unequal streams: " + UNICODE_FILES[fn1] + ", " + UNICODE_FILES[fn2]
    115        );
    116      }
    117    }
    118  }
    119 }
    120 
    121 // utility functions
    122 
    123 function StorageStream() {
    124  return new _SS(8192, Math.pow(2, 32) - 1, null);
    125 }
    126 
    127 function getUnicharInputStream(filename, encoding) {
    128  var file = dataDir.clone();
    129  file.append(filename);
    130 
    131  const PR_RDONLY = 0x1;
    132  var fis = new FIS(
    133    file,
    134    PR_RDONLY,
    135    "0644",
    136    Ci.nsIFileInputStream.CLOSE_ON_EOF
    137  );
    138  return new CIS(fis, encoding, 8192, 0x0);
    139 }
    140 
    141 function getBinaryInputStream(filename) {
    142  var file = dataDir.clone();
    143  file.append(filename);
    144 
    145  const PR_RDONLY = 0x1;
    146  var fis = new FIS(
    147    file,
    148    PR_RDONLY,
    149    "0644",
    150    Ci.nsIFileInputStream.CLOSE_ON_EOF
    151  );
    152  return new BIS(fis);
    153 }
    154 
    155 function equalStreams(stream, codePoints) {
    156  var currIndex = 0;
    157  while (true) {
    158    var unit = stream.readUnit();
    159    if (unit < 0) {
    160      return currIndex == codePoints.length;
    161    }
    162    if (unit !== codePoints[currIndex++]) {
    163      return false;
    164    }
    165  }
    166  // eslint-disable-next-line no-unreachable
    167  do_throw("not reached");
    168  return false;
    169 }
    170 
    171 function equalUnicharStreams(s1, s2) {
    172  var r1, r2;
    173  var str1 = {},
    174    str2 = {};
    175  while (true) {
    176    r1 = s1.readString(1024, str1);
    177    r2 = s2.readString(1024, str2);
    178 
    179    if (r1 != r2 || str1.value != str2.value) {
    180      print("r1: " + r1 + ", r2: " + r2);
    181      print(str1.value.length);
    182      print(str2.value.length);
    183      return false;
    184    }
    185    if (r1 == 0 && r2 == 0) {
    186      return true;
    187    }
    188  }
    189 
    190  // not reached
    191  // eslint-disable-next-line no-unreachable
    192  return false;
    193 }
    194 
    195 function stringToCodePoints(str) {
    196  return str.split("").map(function (v) {
    197    return v.charCodeAt(0);
    198  });
    199 }
    200 
    201 function lowbits(n) {
    202  return Math.pow(2, n) - 1;
    203 }
    204 
    205 function Pipe() {
    206  return new _Pipe(false, false, 1024, 10, null);
    207 }
    208 
    209 // complex charset readers
    210 
    211 /**
    212 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
    213 *
    214 * @param stream
    215 *   the stream to wrap
    216 */
    217 function UTF8(stream) {
    218  this._stream = new BIS(stream);
    219 }
    220 UTF8.prototype = {
    221  // returns numeric code point at front of stream encoded in UTF-8, -1 if at
    222  // end of stream, or throws if valid (and properly encoded!) code point not
    223  // found
    224  readUnit() {
    225    var str = this._stream;
    226 
    227    var c, c2, c3, c4, rv;
    228 
    229    // if at end of stream, must distinguish failure to read any bytes
    230    // (correct behavior) from failure to read some byte after the first
    231    // in the character
    232    try {
    233      c = str.read8();
    234    } catch (e) {
    235      return -1;
    236    }
    237 
    238    if (c < 0x80) {
    239      return c;
    240    }
    241 
    242    if (c < 0xc0) {
    243      // c < 11000000
    244      // byte doesn't have enough leading ones (must be at least two)
    245      throw NS_ERROR_ILLEGAL_VALUE;
    246    }
    247 
    248    c2 = str.read8();
    249    if (c2 >= 0xc0 || c2 < 0x80) {
    250      throw NS_ERROR_ILLEGAL_VALUE;
    251    } // not 10xxxxxx
    252 
    253    if (c < 0xe0) {
    254      // c < 11100000
    255      // two-byte between U+000080 and U+0007FF
    256      rv = ((lowbits(5) & c) << 6) + (lowbits(6) & c2);
    257      // no upper bounds-check needed, by previous lines
    258      if (rv >= 0x80) {
    259        return rv;
    260      }
    261      throw NS_ERROR_ILLEGAL_VALUE;
    262    }
    263 
    264    c3 = str.read8();
    265    if (c3 >= 0xc0 || c3 < 0x80) {
    266      throw NS_ERROR_ILLEGAL_VALUE;
    267    } // not 10xxxxxx
    268 
    269    if (c < 0xf0) {
    270      // c < 11110000
    271      // three-byte between U+000800 and U+00FFFF
    272      rv =
    273        ((lowbits(4) & c) << 12) + ((lowbits(6) & c2) << 6) + (lowbits(6) & c3);
    274      // no upper bounds-check needed, by previous lines
    275      if (rv >= 0xe000 || (rv >= 0x800 && rv <= 0xd7ff)) {
    276        return rv;
    277      }
    278      throw NS_ERROR_ILLEGAL_VALUE;
    279    }
    280 
    281    c4 = str.read8();
    282    if (c4 >= 0xc0 || c4 < 0x80) {
    283      throw NS_ERROR_ILLEGAL_VALUE;
    284    } // not 10xxxxxx
    285 
    286    if (c < 0xf8) {
    287      // c < 11111000
    288      // four-byte between U+010000 and U+10FFFF
    289      rv =
    290        ((lowbits(3) & c) << 18) +
    291        ((lowbits(6) & c2) << 12) +
    292        ((lowbits(6) & c3) << 6) +
    293        (lowbits(6) & c4);
    294      // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
    295      if (rv >= 0x10000 && rv <= 0x10ffff) {
    296        return rv;
    297      }
    298      throw NS_ERROR_ILLEGAL_VALUE;
    299    }
    300 
    301    // 11111000 or greater -- no UTF-8 mapping
    302    throw NS_ERROR_ILLEGAL_VALUE;
    303  },
    304 };
    305 
    306 /**
    307 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
    308 *
    309 * @param stream
    310 *   the stream to wrap
    311 * @param bigEndian
    312 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
    313 *   a byte-order mark
    314 */
    315 function UTF16(stream, bigEndian) {
    316  this._stream = new BIS(stream);
    317  if (arguments.length > 1) {
    318    this._bigEndian = bigEndian;
    319  } else {
    320    var bom = this._stream.read16();
    321    if (bom == 0xfeff) {
    322      this._bigEndian = true;
    323    } else if (bom == 0xfffe) {
    324      this._bigEndian = false;
    325    } else {
    326      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
    327    }
    328  }
    329 }
    330 UTF16.prototype = {
    331  // returns numeric code point at front of stream encoded in UTF-16,
    332  // -1 if at end of stream, or throws if UTF-16 code point not found
    333  readUnit() {
    334    var str = this._stream;
    335 
    336    // if at end of stream, must distinguish failure to read any bytes
    337    // (correct behavior) from failure to read some byte after the first
    338    // in the character
    339    try {
    340      var b1 = str.read8();
    341    } catch (e) {
    342      return -1;
    343    }
    344 
    345    var b2 = str.read8();
    346 
    347    var w1 = this._bigEndian ? (b1 << 8) + b2 : (b2 << 8) + b1;
    348 
    349    if (w1 > 0xdbff && w1 < 0xe000) {
    350      // second surrogate, but expecting none or first
    351      throw NS_ERROR_ILLEGAL_VALUE;
    352    }
    353 
    354    if (w1 > 0xd7ff && w1 < 0xdc00) {
    355      // non-BMP, use surrogate pair
    356      b1 = str.read8();
    357      b2 = str.read8();
    358      var w2 = this._bigEndian ? (b1 << 8) + b2 : (b2 << 8) + b1;
    359      if (w2 < 0xdc00 || w2 > 0xdfff) {
    360        throw NS_ERROR_ILLEGAL_VALUE;
    361      }
    362 
    363      var rv = 0x100000 + ((lowbits(10) & w2) << 10) + (lowbits(10) & w1);
    364      if (rv <= 0x10ffff) {
    365        return rv;
    366      }
    367      throw NS_ERROR_ILLEGAL_VALUE;
    368    }
    369 
    370    // non-surrogate
    371    return w1;
    372  },
    373 };