test_idn_urls.js (11135B)
1 // Test algorithm for unicode display of IDNA URL (bug 722299) 2 3 "use strict"; 4 5 const testcases = [ 6 // Original Punycode or Expected UTF-8 7 // URL normalized form 8 9 // Latin script 10 ["cuillère", "xn--cuillre-6xa", true], 11 12 // repeated non-spacing marks 13 ["gruz̀̀ere", "xn--gruzere-ogea", false], 14 15 // non-XID character 16 ["I♥NY", "xn--iny-zx5a", false], 17 18 /* 19 Behaviour of this test changed in IDNA2008, replacing the non-XID 20 character with U+FFFD replacement character - when all platforms use 21 IDNA2008 it can be uncommented and the punycode URL changed to 22 "xn--mgbl3eb85703a" 23 24 // new non-XID character in Unicode 6.3 25 ["حلا\u061cل", "xn--bgbvr6gc", false], 26 */ 27 28 // U+30FB KATAKANA MIDDLE DOT is excluded from non-XID characters (bug 857490) 29 ["乾燥肌・石けん", "xn--08j4gylj12hz80b0uhfup", true], 30 31 // Cyrillic alone 32 ["толсто́й", "xn--lsa83dealbred", true], 33 34 // Mixed script Cyrillic/Latin 35 ["толсто́й-in-Russian", "xn---in-russian-1jg071b0a8bb4cpd", false], 36 37 // Mixed script Latin/Cyrillic 38 ["war-and-миръ", "xn--war-and--b9g3b7b3h", false], 39 40 // Cherokee (Restricted script) 41 ["ᏣᎳᎩ", "xn--f9dt7l", false], 42 43 // Yi (former Aspirational script, now Restricted per Unicode 10.0 update to UAX 31) 44 ["ꆈꌠꁱꂷ", "xn--4o7a6e1x64c", false], 45 46 // Greek alone 47 ["πλάτων", "xn--hxa3ahjw4a", true], 48 49 // Mixed script Greek/Latin 50 ["πλάτωνicrelationship", "xn--icrelationship-96j4t9a3cwe2e", false], 51 52 // Mixed script Latin/Greek 53 ["spaceὈδύσσεια", "xn--space-h9dui0b0ga2j1562b", false], 54 55 // Devanagari alone 56 ["मराठी", "xn--d2b1ag0dl", true], 57 58 // Devanagari with Armenian 59 ["मराठीՀայաստան", "xn--y9aaa1d0ai1cq964f8dwa2o1a", false], 60 61 // Devanagari with common 62 ["मराठी123", "xn--123-mhh3em2hra", true], 63 64 // Common with Devanagari 65 ["123मराठी", "xn--123-phh3em2hra", true], 66 67 // Latin with Han 68 ["chairman毛", "xn--chairman-k65r", true], 69 70 // Han with Latin 71 ["山葵sauce", "xn--sauce-6j9ii40v", true], 72 73 // Latin with Han, Hiragana and Katakana 74 ["van語ではドイ", "xn--van-ub4bpb6w0in486d", true], 75 76 // Latin with Han, Katakana and Hiragana 77 ["van語ドイでは", "xn--van-ub4bpb4w0ip486d", true], 78 79 // Latin with Hiragana, Han and Katakana 80 ["vanでは語ドイ", "xn--van-ub4bpb6w0ip486d", true], 81 82 // Latin with Hiragana, Katakana and Han 83 ["vanではドイ語", "xn--van-ub4bpb6w0ir486d", true], 84 85 // Latin with Katakana, Han and Hiragana 86 ["vanドイ語では", "xn--van-ub4bpb4w0ir486d", true], 87 88 // Latin with Katakana, Hiragana and Han 89 ["vanドイでは語", "xn--van-ub4bpb4w0it486d", true], 90 91 // Han with Latin, Hiragana and Katakana 92 ["語vanではドイ", "xn--van-ub4bpb6w0ik486d", true], 93 94 // Han with Latin, Katakana and Hiragana 95 ["語vanドイでは", "xn--van-ub4bpb4w0im486d", true], 96 97 // Han with Hiragana, Latin and Katakana 98 ["語ではvanドイ", "xn--van-rb4bpb9w0ik486d", true], 99 100 // Han with Hiragana, Katakana and Latin 101 ["語ではドイvan", "xn--van-rb4bpb6w0in486d", true], 102 103 // Han with Katakana, Latin and Hiragana 104 ["語ドイvanでは", "xn--van-ub4bpb1w0ip486d", true], 105 106 // Han with Katakana, Hiragana and Latin 107 ["語ドイではvan", "xn--van-rb4bpb4w0ip486d", true], 108 109 // Hiragana with Latin, Han and Katakana 110 ["イツvan語ではド", "xn--van-ub4bpb1wvhsbx330n", true], 111 112 // Hiragana with Latin, Katakana and Han 113 ["ではvanドイ語", "xn--van-rb4bpb9w0ir486d", true], 114 115 // Hiragana with Han, Latin and Katakana 116 ["では語vanドイ", "xn--van-rb4bpb9w0im486d", true], 117 118 // Hiragana with Han, Katakana and Latin 119 ["では語ドイvan", "xn--van-rb4bpb6w0ip486d", true], 120 121 // Hiragana with Katakana, Latin and Han 122 ["ではドイvan語", "xn--van-rb4bpb6w0iu486d", true], 123 124 // Hiragana with Katakana, Han and Latin 125 ["ではドイ語van", "xn--van-rb4bpb6w0ir486d", true], 126 127 // Katakana with Latin, Han and Hiragana 128 ["ドイvan語では", "xn--van-ub4bpb1w0iu486d", true], 129 130 // Katakana with Latin, Hiragana and Han 131 ["ドイvanでは語", "xn--van-ub4bpb1w0iw486d", true], 132 133 // Katakana with Han, Latin and Hiragana 134 ["ドイ語vanでは", "xn--van-ub4bpb1w0ir486d", true], 135 136 // Katakana with Han, Hiragana and Latin 137 ["ドイ語ではvan", "xn--van-rb4bpb4w0ir486d", true], 138 139 // Katakana with Hiragana, Latin and Han 140 ["ドイではvan語", "xn--van-rb4bpb4w0iw486d", true], 141 142 // Katakana with Hiragana, Han and Latin 143 ["ドイでは語van", "xn--van-rb4bpb4w0it486d", true], 144 145 // Han with common 146 ["中国123", "xn--123-u68dy61b", true], 147 148 // common with Han 149 ["123中国", "xn--123-x68dy61b", true], 150 151 // Characters that normalize to permitted characters 152 // (also tests Plane 1 supplementary characters) 153 ["super𝟖", "super8", true], 154 155 // Han from Plane 2 156 ["𠀀𠀁𠀂", "xn--j50icd", false], 157 ["𠜎𠜱𠝹", "xn--4m2igcqk", true], 158 159 // Han from Plane 2 with js (UTF-16) escapes 160 ["\uD840\uDC00\uD840\uDC01\uD840\uDC02", "xn--j50icd", false], 161 ["\uD841\uDF0E\uD841\uDF31\uD841\uDF79", "xn--4m2igcqk", true], 162 163 // Same with a lone high surrogate at the end 164 // Throws due to unpaired surrogate 165 // ["\uD840\uDC00\uD840\uDC01\uD840", "xn--zn7c0336bda", false], 166 167 // Latin text and Bengali digits 168 ["super৪", "xn--super-k2l", false], 169 170 // Bengali digits and Latin text 171 ["৫ab", "xn--ab-x5f", false], 172 173 // Bengali text and Latin digits 174 ["অঙ্কুর8", "xn--8-70d2cp0j6dtd", true], 175 176 // Latin digits and Bengali text 177 ["5াব", "xn--5-h3d7c", true], 178 179 // Mixed numbering systems 180 // Throws due to bidi rule violation 181 // ["٢٠۰٠", "xn--8hbae38c", false], 182 183 // Traditional Chinese 184 ["萬城", "xn--uis754h", true], 185 186 // Simplified Chinese 187 ["万城", "xn--chq31v", true], 188 189 // Simplified-only and Traditional-only Chinese in the same label 190 ["万萬城", "xn--chq31vsl1b", true], 191 192 // Traditional-only and Simplified-only Chinese in the same label 193 ["萬万城", "xn--chq31vrl1b", true], 194 195 // Han and Latin and Bopomofo 196 ["注音符号bopomofoㄅㄆㄇㄈ", "xn--bopomofo-hj5gkalm1637i876cuw0brk5f", false], 197 198 // Han, bopomofo, Latin 199 // Bug 1885096: Since the last character of "ㄅㄆㄇㄈ" is a CJK Ideograph, 200 // just use the first character "ㄅ" from the sequence "ㄅㄆㄇㄈ". 201 ["注音符号ㄅbopomofo", "xn--bopomofo-8i5gx891aylvccz9asi4e", false], 202 203 // Latin, Han, Bopomofo 204 ["bopomofo注音符号ㄅㄆㄇㄈ", "xn--bopomofo-hj5gkalm9637i876cuw0brk5f", false], 205 206 // Latin, Bopomofo, Han 207 ["bopomofoㄅㄆㄇㄈ注音符号", "xn--bopomofo-hj5gkalm3737i876cuw0brk5f", false], 208 209 // Bopomofo, Han, Latin 210 ["ㄅㄆㄇㄈ注音符号bopomofo", "xn--bopomofo-8i5gkalm3737i876cuw0brk5f", false], 211 212 // Bopomofo, Latin, Han 213 // Bug 1885096: Since the last character of "ㄅㄆㄇㄈ" is a CJK Ideograph, 214 // just use the first character "ㄅ" from the sequence "ㄅㄆㄇㄈ". 215 ["ㄅbopomofo注音符号", "xn--bopomofo-8i5g6891aylvccz9asi4e", false], 216 217 // Han, bopomofo and katakana 218 ["注音符号ㄅㄆㄇㄈボポモフォ", "xn--jckteuaez1shij0450gylvccz9asi4e", false], 219 220 // Han, katakana, bopomofo 221 ["注音符号ボポモフォㄅㄆㄇㄈ", "xn--jckteuaez6shij5350gylvccz9asi4e", false], 222 223 // bopomofo, han, katakana 224 ["ㄅㄆㄇㄈ注音符号ボポモフォ", "xn--jckteuaez1shij4450gylvccz9asi4e", false], 225 226 // bopomofo, katakana, han 227 ["ㄅㄆㄇㄈボポモフォ注音符号", "xn--jckteuaez1shij9450gylvccz9asi4e", false], 228 229 // katakana, Han, bopomofo 230 ["ボポモフォ注音符号ㄅㄆㄇㄈ", "xn--jckteuaez6shij0450gylvccz9asi4e", false], 231 232 // katakana, bopomofo, Han 233 ["ボポモフォㄅㄆㄇㄈ注音符号", "xn--jckteuaez6shij4450gylvccz9asi4e", false], 234 235 // Han, Hangul and Latin 236 ["韓한글hangul", "xn--hangul-2m5ti09k79ze", true], 237 238 // Han, Latin and Hangul 239 ["韓hangul한글", "xn--hangul-2m5to09k79ze", true], 240 241 // Hangul, Han and Latin 242 ["한글韓hangul", "xn--hangul-2m5th09k79ze", true], 243 244 // Hangul, Latin and Han 245 ["한글hangul韓", "xn--hangul-8m5t898k79ze", true], 246 247 // Latin, Han and Hangul 248 ["hangul韓한글", "xn--hangul-8m5ti09k79ze", true], 249 250 // Latin, Hangul and Han 251 ["hangul한글韓", "xn--hangul-8m5th09k79ze", true], 252 253 // Hangul and katakana 254 ["한글ハングル", "xn--qck1c2d4a9266lkmzb", false], 255 256 // Katakana and Hangul 257 ["ハングル한글", "xn--qck1c2d4a2366lkmzb", false], 258 259 // Thai (also tests that node with over 63 UTF-8 octets doesn't fail) 260 [ 261 "เครื่องทําน้ําทําน้ําแข็ง", 262 "xn--22cdjb2fanb9fyepcbbb9dwh4a3igze4fdcd", 263 true, 264 ], 265 266 // Effect of adding valid or invalid subdomains (bug 1399540) 267 ["曹曳曷曽.ascii", "xn--movies.ascii", true], 268 ["ascii.曹曳曷曽", "ascii.xn--movies", true], 269 ["中国123.曹曳曷曽", "xn--123-u68dy61b.xn--movies", true], 270 ["曹曳曷曽.中国123", "xn--movies.xn--123-u68dy61b", true], 271 // Throw due to bogus Punycode 272 // [ 273 // "xn--accountlogin.䕮䕵䕶䕱", 274 // "xn--accountlogin.xn--google", 275 // true, 276 // ], 277 // [ 278 // "䕮䕵䕶䕱.xn--accountlogin", 279 // "xn--google.xn--accountlogin", 280 // true, 281 // ], 282 283 // Arabic diacritic not allowed in Latin text (bug 1370497) 284 ["goo\u0650gle", "xn--google-yri", false], 285 // ...but Arabic diacritics are allowed on Arabic text 286 ["العَرَبِي", "xn--mgbc0a5a6cxbzabt", true], 287 288 // Hebrew diacritic also not allowed in Latin text (bug 1404349) 289 ["goo\u05b4gle", "xn--google-rvh", false], 290 291 // Accents above dotless-i are not allowed 292 ["na\u0131\u0308ve", "xn--nave-mza04z", false], 293 ["d\u0131\u0302ner", "xn--dner-lza40z", false], 294 // but the corresponding accented-i (based on dotted i) is OK 295 ["na\u00efve.com", "xn--nave-6pa.com", true], 296 ["d\u00eener.com", "xn--dner-0pa.com", true], 297 ]; 298 299 function run_test() { 300 var idnService = Cc["@mozilla.org/network/idn-service;1"].getService( 301 Ci.nsIIDNService 302 ); 303 304 for (var j = 0; j < testcases.length; ++j) { 305 var test = testcases[j]; 306 var URL = test[0] + ".com"; 307 var punycodeURL = test[1] + ".com"; 308 var expectedUnicode = test[2]; 309 310 var result; 311 try { 312 result = idnService.convertToDisplayIDN(URL); 313 } catch (e) { 314 result = ".com"; 315 } 316 if ( 317 punycodeURL.substr(0, 4) == "xn--" || 318 punycodeURL.indexOf(".xn--") > 0 319 ) { 320 // test convertToDisplayIDN with a Unicode URL and with a 321 // Punycode URL if we have one 322 Assert.equal( 323 escape(result), 324 expectedUnicode ? escape(URL) : escape(punycodeURL) 325 ); 326 327 result = idnService.convertToDisplayIDN(punycodeURL); 328 Assert.equal( 329 escape(result), 330 expectedUnicode ? escape(URL) : escape(punycodeURL) 331 ); 332 } else { 333 // The "punycode" URL isn't punycode. This happens in testcases 334 // where the Unicode URL has become normalized to an ASCII URL, 335 // so, even though expectedUnicode is true, the expected result 336 // is equal to punycodeURL 337 Assert.equal(escape(result), escape(punycodeURL)); 338 } 339 } 340 }