test_utf.js (6229B)
// NOTE: Requires testharness.js
// http://www.w3.org/2008/webapps/wiki/Harness

/**
 * Extension to the testharness.js API which avoids logging enormous strings
 * on a coding failure.
 *
 * Identical strings pass immediately. Otherwise the lengths are compared and
 * then the strings are compared code unit by code unit, so a failure message
 * only ever names a length or a single code unit, never the whole string.
 *
 * @param {string} actual - The string produced by the code under test.
 * @param {string} expected - The reference string.
 * @param {string} description - Prefix for every assertion message.
 */
function assert_string_equals(actual, expected, description) {
  // short circuit success case
  if (actual === expected) {
    assert_true(true, description + ": <actual> === <expected>");
    return;
  }

  // length check
  assert_equals(
    actual.length,
    expected.length,
    description + ": string lengths"
  );

  var i, a, b;
  for (i = 0; i < actual.length; i++) {
    a = actual.charCodeAt(i);
    b = expected.charCodeAt(i);
    if (a !== b) {
      assert_true(
        false,
        description +
          ": code unit " +
          i.toString() +
          " unequal: " +
          cpname(a) +
          " != " +
          cpname(b)
      );
    } // doesn't return
  }

  // It should be impossible to get here, because the initial
  // comparison failed, so either the length comparison or the
  // codeunit-by-codeunit comparison should also fail.
  assert_true(false, description + ": failed to detect string difference");
}

// Inspired by:
// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
/**
 * Reference UTF-8 encoder built on the legacy URI helpers.
 *
 * encodeURIComponent() percent-escapes the UTF-8 bytes of the input and
 * unescape() turns each escape back into one character, so every char code
 * of the intermediate string is one octet.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} One element per char code of the intermediate string.
 */
function encode_utf8(string) {
  var utf8 = unescape(encodeURIComponent(string));
  var octets = new Uint8Array(utf8.length),
    i;
  for (i = 0; i < utf8.length; i += 1) {
    octets[i] = utf8.charCodeAt(i);
  }
  return octets;
}

/**
 * Serializes each UTF-16 code unit of the string as two octets, low byte
 * first.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} Two octets per code unit.
 */
function encode_utf16le(string) {
  var octets = new Uint8Array(string.length * 2);
  var di = 0;
  for (var i = 0; i < string.length; i++) {
    var code = string.charCodeAt(i);
    octets[di++] = code & 0xff;
    octets[di++] = code >> 8;
  }
  return octets;
}

/**
 * Serializes each UTF-16 code unit of the string as two octets, high byte
 * first.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} Two octets per code unit.
 */
function encode_utf16be(string) {
  var octets = new Uint8Array(string.length * 2);
  var di = 0;
  for (var i = 0; i < string.length; i++) {
    var code = string.charCodeAt(i);
    octets[di++] = code >> 8;
    octets[di++] = code & 0xff;
  }
  return octets;
}

/**
 * Reference UTF-8 decoder, the inverse of encode_utf8(): builds a string with
 * one character per octet, then lets escape()/decodeURIComponent() interpret
 * it.
 *
 * All octets are passed to String.fromCharCode in a single apply() call.
 *
 * @param {Uint8Array} octets - Octets to decode.
 * @returns {string} The decoded text.
 */
function decode_utf8(octets) {
  var utf8 = String.fromCharCode.apply(null, octets);
  return decodeURIComponent(escape(utf8));
}

// Helpers for test_utf_roundtrip.

/**
 * Formats a number in "U+XXXX" notation for messages: four upper-case hex
 * digits up to 0xFFFF, six above that.
 *
 * @param {*} n - Value to format.
 * @returns {string} "U+" plus padded hex digits, or n.toString() when
 *     `n + 0 !== n` (e.g. a non-number or NaN).
 */
function cpname(n) {
  if (n + 0 !== n) {
    return n.toString();
  }
  var w = n <= 0xffff ? 4 : 6;
  return "U+" + ("000000" + n.toString(16).toUpperCase()).slice(-w);
}

/**
 * Builds a string containing every code point in [from, from + len), in
 * order, skipping the surrogate range U+D800..U+DFFF. Code points above
 * U+FFFF are written as a surrogate pair.
 *
 * @param {number} from - First code point of the block.
 * @param {number} len - Number of code points to walk, skipped ones included.
 * @returns {string} The generated block.
 */
function genblock(from, len) {
  var i, j, point, offset;
  var size, block;

  // determine size required:
  //    1 unit   for each point from U+000000 through U+00D7FF
  //    0 units                      U+00D800 through U+00DFFF
  //    1 unit                       U+00E000 through U+00FFFF
  //    2 units                      U+010000 through U+10FFFF
  // overlap() returns the length of the intersection of two half-open
  // ranges [min1, max1) and [min2, max2).
  function overlap(min1, max1, min2, max2) {
    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
  }
  size =
    overlap(from, from + len, 0x000000, 0x00d800) +
    overlap(from, from + len, 0x00e000, 0x010000) +
    overlap(from, from + len, 0x010000, 0x110000) * 2;

  block = new Uint16Array(size);
  for (i = 0, j = 0; i < len; i++) {
    point = from + i;
    if (0xd800 <= point && point <= 0xdfff) {
      continue;
    } else if (point <= 0xffff) {
      block[j++] = point;
    } else {
      offset = point - 0x10000;
      block[j++] = 0xd800 + (offset >> 10);
      block[j++] = 0xdc00 + (offset & 0x3ff);
    }
  }
  return String.fromCharCode.apply(null, block);
}

/**
 * Walks the code point range in BLOCK_SIZE steps and checks, for each block,
 * that UTF-16LE, UTF-16BE and UTF-8 round-trip through TextDecoder /
 * TextEncoder, and that the UTF-8 results agree with the
 * encodeURIComponent-based reference helpers.
 */
function test_utf_roundtrip() {
  var MIN_CODEPOINT = 0;
  var MAX_CODEPOINT = 0x10ffff;
  var BLOCK_SIZE = 0x1000;

  var block, block_tag, i, encoded, decoded, exp_encoded, exp_decoded;

  var TD_U16LE = new TextDecoder("UTF-16LE");

  var TD_U16BE = new TextDecoder("UTF-16BE");

  var TE_U8 = new TextEncoder();
  var TD_U8 = new TextDecoder("UTF-8");

  for (i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) {
    block_tag = cpname(i) + " - " + cpname(i + BLOCK_SIZE - 1);
    block = genblock(i, BLOCK_SIZE);

    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
    // The decoder output is the value under test, so it is passed as
    // `actual` and the generated block as `expected`; failure messages then
    // attribute the values to the right side.
    encoded = encode_utf16le(block);
    decoded = TD_U16LE.decode(encoded);
    assert_string_equals(decoded, block, "UTF-16LE round trip " + block_tag);

    encoded = encode_utf16be(block);
    decoded = TD_U16BE.decode(encoded);
    assert_string_equals(decoded, block, "UTF-16BE round trip " + block_tag);

    encoded = TE_U8.encode(block);
    decoded = TD_U8.decode(encoded);
    assert_string_equals(decoded, block, "UTF-8 round trip " + block_tag);

    // test TextEncoder(UTF-8) against the older idiom
    exp_encoded = encode_utf8(block);
    assert_array_equals(
      encoded,
      exp_encoded,
      "UTF-8 reference encoding " + block_tag
    );

    exp_decoded = decode_utf8(exp_encoded);
    assert_string_equals(
      decoded,
      exp_decoded,
      "UTF-8 reference decoding " + block_tag
    );
  }
}

/**
 * Checks a fixed sample string against hard-coded byte sequences: the
 * TextEncoder output must equal the UTF-8 bytes, and decoding each listed
 * byte sequence with its encoding label must reproduce the sample.
 */
function test_utf_samples() {
  // z, cent, CJK water, G-Clef, Private-use character
  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
  var cases = [
    {
      encoding: "utf-8",
      expected: [
        0x7a, 0xc2, 0xa2, 0xe6, 0xb0, 0xb4, 0xf0, 0x9d, 0x84, 0x9e, 0xf4, 0x8f,
        0xbf, 0xbd,
      ],
    },
    {
      encoding: "utf-16le",
      expected: [
        0x7a, 0x00, 0xa2, 0x00, 0x34, 0x6c, 0x34, 0xd8, 0x1e, 0xdd, 0xff, 0xdb,
        0xfd, 0xdf,
      ],
    },
    {
      // Same bytes as the "utf-16le" case: the test expects the "utf-16"
      // label to decode as little-endian.
      encoding: "utf-16",
      expected: [
        0x7a, 0x00, 0xa2, 0x00, 0x34, 0x6c, 0x34, 0xd8, 0x1e, 0xdd, 0xff, 0xdb,
        0xfd, 0xdf,
      ],
    },
    {
      encoding: "utf-16be",
      expected: [
        0x00, 0x7a, 0x00, 0xa2, 0x6c, 0x34, 0xd8, 0x34, 0xdd, 0x1e, 0xdb, 0xff,
        0xdf, 0xfd,
      ],
    },
  ];

  // cases[0] holds the UTF-8 bytes, which is what TextEncoder produces.
  var encoded = new TextEncoder().encode(sample);
  assert_array_equals(encoded, cases[0].expected, "expected equal encodings");

  cases.forEach(function (t) {
    var decoded = new TextDecoder(t.encoding).decode(
      new Uint8Array(t.expected)
    );
    assert_equals(decoded, sample, "expected equal decodings - " + t.encoding);
  });
}

test(
  test_utf_samples,
  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample"
);

test(
  test_utf_roundtrip,
  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and " +
    "agreement with encode/decodeURIComponent"
);