test_charset_conversion.js (9232B)
const NS_ERROR_ILLEGAL_VALUE = Cr.NS_ERROR_ILLEGAL_VALUE;

// XPCOM constructors; assigned in run_test().
var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;

// Directory holding the test data files; assigned in run_test().
var dataDir;

/**
 * Test entry point: sets up the XPCOM constructors and the data directory
 * used by the helpers below, then runs each test in turn.
 */
function run_test() {
  BIS = Components.Constructor(
    "@mozilla.org/binaryinputstream;1",
    "nsIBinaryInputStream",
    "setInputStream"
  );
  BOS = Components.Constructor(
    "@mozilla.org/binaryoutputstream;1",
    "nsIBinaryOutputStream",
    "setOutputStream"
  );
  _Pipe = Components.Constructor("@mozilla.org/pipe;1", "nsIPipe", "init");
  COS = Components.Constructor(
    "@mozilla.org/intl/converter-output-stream;1",
    "nsIConverterOutputStream",
    "init"
  );
  FIS = Components.Constructor(
    "@mozilla.org/network/file-input-stream;1",
    "nsIFileInputStream",
    "init"
  );
  _SS = Components.Constructor(
    "@mozilla.org/storagestream;1",
    "nsIStorageStream",
    "init"
  );
  CIS = Components.Constructor(
    "@mozilla.org/intl/converter-input-stream;1",
    "nsIConverterInputStream",
    "init"
  );

  dataDir = do_get_file("data/");

  test_utf8_1();
  test_cross_conversion();
}

const UNICODE_STRINGS = [
  "\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE",

  "AZaz09 \u007F " + // U+000000 to U+00007F
    "\u0080 \u0398 \u03BB \u0725 " + // U+000080 to U+0007FF
    "\u0964 \u0F5F \u20AC \uFFFB", // U+000800 to U+00FFFF

  // No strings containing non-BMP code points are listed here. JS strings
  // are sequences of 16-bit code units, so a non-BMP code point is stored as
  // a surrogate pair and String.prototype.charCodeAt() only ever returns one
  // 16-bit unit of it.
  // TODO: consider adding non-BMP strings now that stringToCodePoints()
  // decodes surrogate pairs.
];

// test conversion equality -- keys are names of files containing equivalent
// Unicode data, values are the encoding of the file in the format expected by
// nsIConverter(In|Out)putStream.init
const UNICODE_FILES = {
  "unicode-conversion.utf8.txt": "UTF-8",
  "unicode-conversion.utf16.txt": "UTF-16",
  "unicode-conversion.utf16le.txt": "UTF-16LE",
  "unicode-conversion.utf16be.txt": "UTF-16BE",
};

/**
 * Writes each entry of UNICODE_STRINGS through a UTF-8 converter output
 * stream into a pipe, decodes the bytes read back from the pipe with the
 * hand-written UTF8 reader, and checks the result against the code points of
 * the original string.
 */
function test_utf8_1() {
  for (let i = 0; i < UNICODE_STRINGS.length; i++) {
    const source = UNICODE_STRINGS[i];
    const pipe = Pipe();
    const conv = new COS(pipe.outputStream, "UTF-8");
    Assert.ok(conv.writeString(source));
    conv.close();

    const decoded = new UTF8(pipe.inputStream);
    if (!equalStreams(decoded, stringToCodePoints(source))) {
      do_throw(`UNICODE_STRINGS[${i}] not handled correctly`);
    }
  }
}

/**
 * For every ordered pair of files in UNICODE_FILES, checks that decoding the
 * raw bytes of the first file (buffered in a storage stream) with its
 * encoding produces the same text as decoding the second file with its own
 * encoding.
 */
function test_cross_conversion() {
  for (const fn1 in UNICODE_FILES) {
    // Copy the raw bytes of fn1 into a storage stream so they can be re-read
    // once per comparison below.
    const fin = getBinaryInputStream(fn1);
    const ss = StorageStream();

    const bos = new BOS(ss.getOutputStream(0));
    let av;
    while ((av = fin.available()) > 0) {
      bos.writeByteArray(fin.readByteArray(av));
    }
    fin.close();
    bos.close();

    for (const fn2 in UNICODE_FILES) {
      const fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      const unichar = new CIS(
        ss.newInputStream(0),
        UNICODE_FILES[fn1],
        8192,
        0x0
      );

      if (!equalUnicharStreams(unichar, fin2)) {
        do_throw(
          `unequal streams: ${UNICODE_FILES[fn1]}, ${UNICODE_FILES[fn2]}`
        );
      }
    }
  }
}

// utility functions

/**
 * Creates a new storage stream initialised with a segment size of 8192 and a
 * maximum size of 2**32 - 1.
 */
function StorageStream() {
  return new _SS(8192, 2 ** 32 - 1, null);
}

/**
 * Opens the named file from the data directory as a read-only file input
 * stream that closes itself at end of file. Private helper shared by
 * getUnicharInputStream() and getBinaryInputStream().
 *
 * @param filename
 *   name of a file directly inside dataDir
 * @returns the file input stream
 */
function openDataFileStream(filename) {
  const file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  return new FIS(
    file,
    PR_RDONLY,
    "0644",
    Ci.nsIFileInputStream.CLOSE_ON_EOF
  );
}

/**
 * Opens a data file as a converter (character) input stream.
 *
 * @param filename
 *   name of a file directly inside dataDir
 * @param encoding
 *   charset name passed to the converter input stream
 * @returns a converter input stream over the file
 */
function getUnicharInputStream(filename, encoding) {
  return new CIS(openDataFileStream(filename), encoding, 8192, 0x0);
}

/**
 * Opens a data file as a binary input stream.
 *
 * @param filename
 *   name of a file directly inside dataDir
 * @returns a binary input stream over the file
 */
function getBinaryInputStream(filename) {
  return new BIS(openDataFileStream(filename));
}

/**
 * Compares the units produced by a reader against an expected array.
 *
 * @param stream
 *   an object with a readUnit() method returning a number, negative at end
 *   of stream (for example a UTF8 or UTF16 wrapper)
 * @param codePoints
 *   array of expected numeric values, in order
 * @returns true if the reader yields exactly the expected values and then
 *   ends; false on the first mismatch or on a length difference
 */
function equalStreams(stream, codePoints) {
  let currIndex = 0;
  while (true) {
    const unit = stream.readUnit();
    if (unit < 0) {
      // End of stream: equal only if every expected value was consumed.
      return currIndex === codePoints.length;
    }
    if (unit !== codePoints[currIndex++]) {
      return false;
    }
  }
}

/**
 * Compares two character input streams chunk by chunk.
 *
 * @param s1
 *   first stream; must provide readString(count, outObject)
 * @param s2
 *   second stream; must provide readString(count, outObject)
 * @returns true if both streams yield the same chunks until both report
 *   zero characters read; false (after printing diagnostics) otherwise
 */
function equalUnicharStreams(s1, s2) {
  const str1 = {};
  const str2 = {};
  while (true) {
    const r1 = s1.readString(1024, str1);
    const r2 = s2.readString(1024, str2);

    if (r1 !== r2 || str1.value !== str2.value) {
      print("r1: " + r1 + ", r2: " + r2);
      print(str1.value.length);
      print(str2.value.length);
      return false;
    }
    // r1 === r2 here, so one check covers both streams.
    if (r1 === 0) {
      return true;
    }
  }
}

/**
 * Converts a string to an array of its Unicode code points. A valid
 * surrogate pair yields a single code point above U+FFFF; an unpaired
 * surrogate yields its own 16-bit value.
 *
 * @param str
 *   the string to convert
 * @returns array of numeric code points
 */
function stringToCodePoints(str) {
  // Array.from iterates a string by code point rather than by code unit.
  return Array.from(str, ch => ch.codePointAt(0));
}

/**
 * Returns a mask with the n lowest bits set, i.e. 2**n - 1.
 */
function lowbits(n) {
  return 2 ** n - 1;
}

/**
 * Creates a new pipe via nsIPipe.init(false, false, 1024, 10, null).
 */
function Pipe() {
  return new _Pipe(false, false, 1024, 10, null);
}

// complex charset readers

/**
 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 */
function UTF8(stream) {
  this._stream = new BIS(stream);
}
UTF8.prototype = {
  // returns numeric code point at front of stream encoded in UTF-8, -1 if at
  // end of stream, or throws if valid (and properly encoded!) code point not
  // found
  readUnit() {
    const str = this._stream;

    // if at end of stream, must distinguish failure to read any bytes
    // (correct behavior) from failure to read some byte after the first
    // in the character
    let c;
    try {
      c = str.read8();
    } catch (e) {
      return -1;
    }

    if (c < 0x80) {
      return c;
    }

    if (c < 0xc0) {
      // c < 11000000
      // byte doesn't have enough leading ones (must be at least two)
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    const c2 = str.read8();
    if (c2 >= 0xc0 || c2 < 0x80) {
      throw NS_ERROR_ILLEGAL_VALUE;
    } // not 10xxxxxx

    if (c < 0xe0) {
      // c < 11100000
      // two-byte between U+000080 and U+0007FF
      const rv = ((lowbits(5) & c) << 6) + (lowbits(6) & c2);
      // no upper bounds-check needed, by previous lines; the lower bound
      // rejects overlong encodings
      if (rv >= 0x80) {
        return rv;
      }
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    const c3 = str.read8();
    if (c3 >= 0xc0 || c3 < 0x80) {
      throw NS_ERROR_ILLEGAL_VALUE;
    } // not 10xxxxxx

    if (c < 0xf0) {
      // c < 11110000
      // three-byte between U+000800 and U+00FFFF
      const rv =
        ((lowbits(4) & c) << 12) + ((lowbits(6) & c2) << 6) + (lowbits(6) & c3);
      // no upper bounds-check needed, by previous lines; reject overlong
      // encodings (< U+0800) and the surrogate range U+D800..U+DFFF
      if (rv >= 0xe000 || (rv >= 0x800 && rv <= 0xd7ff)) {
        return rv;
      }
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    const c4 = str.read8();
    if (c4 >= 0xc0 || c4 < 0x80) {
      throw NS_ERROR_ILLEGAL_VALUE;
    } // not 10xxxxxx

    if (c < 0xf8) {
      // c < 11111000
      // four-byte between U+010000 and U+10FFFF
      const rv =
        ((lowbits(3) & c) << 18) +
        ((lowbits(6) & c2) << 12) +
        ((lowbits(6) & c3) << 6) +
        (lowbits(6) & c4);
      // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
      if (rv >= 0x10000 && rv <= 0x10ffff) {
        return rv;
      }
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    // 11111000 or greater -- no UTF-8 mapping
    throw NS_ERROR_ILLEGAL_VALUE;
  },
};

/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
 *   a byte-order mark
 */
function UTF16(stream, bigEndian) {
  this._stream = new BIS(stream);
  // `arguments.length` (rather than a default value) is deliberate: the
  // byte-order mark is only consumed when the argument is omitted entirely.
  if (arguments.length > 1) {
    this._bigEndian = bigEndian;
  } else {
    const bom = this._stream.read16();
    if (bom == 0xfeff) {
      this._bigEndian = true;
    } else if (bom == 0xfffe) {
      this._bigEndian = false;
    } else {
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
    }
  }
}
UTF16.prototype = {
  // returns numeric code point at front of stream encoded in UTF-16,
  // -1 if at end of stream, or throws if UTF-16 code point not found
  readUnit() {
    const str = this._stream;
    const bigEndian = this._bigEndian;

    // Combines two bytes, in stream order, into one 16-bit code unit.
    const toWord = (first, second) =>
      bigEndian ? (first << 8) + second : (second << 8) + first;

    // if at end of stream, must distinguish failure to read any bytes
    // (correct behavior) from failure to read some byte after the first
    // in the character
    let b1;
    try {
      b1 = str.read8();
    } catch (e) {
      return -1;
    }

    const w1 = toWord(b1, str.read8());

    if (w1 > 0xdbff && w1 < 0xe000) {
      // second (low) surrogate, but expecting none or first
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    if (w1 > 0xd7ff && w1 < 0xdc00) {
      // non-BMP, use surrogate pair: w1 is the high surrogate, w2 must be a
      // low surrogate
      const b3 = str.read8();
      const w2 = toWord(b3, str.read8());
      if (w2 < 0xdc00 || w2 > 0xdfff) {
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      // The high surrogate carries the upper 10 bits and the low surrogate
      // the lower 10 bits of (code point - 0x10000). The result is always in
      // U+10000..U+10FFFF, so no further range check is needed.
      return 0x10000 + ((lowbits(10) & w1) << 10) + (lowbits(10) & w2);
    }

    // non-surrogate
    return w1;
  },
};