tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

test_idn_urls.js (11135B)


      1 // Test algorithm for unicode display of IDNA URL (bug 722299)
      2 
      3 "use strict";
      4 
// Table of IDN display testcases consumed by run_test().
//
// Each row is a three-element array:
//   [0] the original (Unicode) host label(s); run_test() appends ".com" and
//       passes the result to convertToDisplayIDN()
//   [1] the Punycode form, or the normalized ASCII form when the label
//       normalizes to plain ASCII; run_test() appends ".com" to this as well
//   [2] boolean: true when the display form is expected to be the Unicode
//       host, false when it is expected to be the Punycode host
//
// When [1] contains no "xn--" label, run_test() expects the display form to
// equal [1] + ".com" regardless of [2].
const testcases = [
  //  Original             Punycode or         Expect Unicode
  //    URL              normalized form        display form?

  // Latin script
  ["cuillère", "xn--cuillre-6xa", true],

  // repeated non-spacing marks
  ["gruz̀̀ere", "xn--gruzere-ogea", false],

  // non-XID character
  ["I♥NY", "xn--iny-zx5a", false],

  /*
  Behaviour of this test changed in IDNA2008, replacing the non-XID
  character with U+FFFD replacement character - when all platforms use
  IDNA2008 it can be uncommented and the punycode URL changed to
   "xn--mgbl3eb85703a"

    // new non-XID character in Unicode 6.3
    ["حلا\u061cل", "xn--bgbvr6gc",                    false],
  */

  // U+30FB KATAKANA MIDDLE DOT is excluded from non-XID characters (bug 857490)
  ["乾燥肌・石けん", "xn--08j4gylj12hz80b0uhfup", true],

  // Cyrillic alone
  ["толсто́й", "xn--lsa83dealbred", true],

  // Mixed script Cyrillic/Latin
  ["толсто́й-in-Russian", "xn---in-russian-1jg071b0a8bb4cpd", false],

  // Mixed script Latin/Cyrillic
  ["war-and-миръ", "xn--war-and--b9g3b7b3h", false],

  // Cherokee (Restricted script)
  ["ᏣᎳᎩ", "xn--f9dt7l", false],

  // Yi (former Aspirational script, now Restricted per Unicode 10.0 update to UAX 31)
  ["ꆈꌠꁱꂷ", "xn--4o7a6e1x64c", false],

  // Greek alone
  ["πλάτων", "xn--hxa3ahjw4a", true],

  // Mixed script Greek/Latin
  ["πλάτωνicrelationship", "xn--icrelationship-96j4t9a3cwe2e", false],

  // Mixed script Latin/Greek
  ["spaceὈδύσσεια", "xn--space-h9dui0b0ga2j1562b", false],

  // Devanagari alone
  ["मराठी", "xn--d2b1ag0dl", true],

  // Devanagari with Armenian
  ["मराठीՀայաստան", "xn--y9aaa1d0ai1cq964f8dwa2o1a", false],

  // Devanagari with common
  ["मराठी123", "xn--123-mhh3em2hra", true],

  // Common with Devanagari
  ["123मराठी", "xn--123-phh3em2hra", true],

  // Latin with Han
  ["chairman毛", "xn--chairman-k65r", true],

  // Han with Latin
  ["山葵sauce", "xn--sauce-6j9ii40v", true],

  // Latin with Han, Hiragana and Katakana
  ["van語ではドイ", "xn--van-ub4bpb6w0in486d", true],

  // Latin with Han, Katakana and Hiragana
  ["van語ドイでは", "xn--van-ub4bpb4w0ip486d", true],

  // Latin with Hiragana, Han and Katakana
  ["vanでは語ドイ", "xn--van-ub4bpb6w0ip486d", true],

  // Latin with Hiragana, Katakana and Han
  ["vanではドイ語", "xn--van-ub4bpb6w0ir486d", true],

  // Latin with Katakana, Han and Hiragana
  ["vanドイ語では", "xn--van-ub4bpb4w0ir486d", true],

  // Latin with Katakana, Hiragana and Han
  ["vanドイでは語", "xn--van-ub4bpb4w0it486d", true],

  // Han with Latin, Hiragana and Katakana
  ["語vanではドイ", "xn--van-ub4bpb6w0ik486d", true],

  // Han with Latin, Katakana and Hiragana
  ["語vanドイでは", "xn--van-ub4bpb4w0im486d", true],

  // Han with Hiragana, Latin and Katakana
  ["語ではvanドイ", "xn--van-rb4bpb9w0ik486d", true],

  // Han with Hiragana, Katakana and Latin
  ["語ではドイvan", "xn--van-rb4bpb6w0in486d", true],

  // Han with Katakana, Latin and Hiragana
  ["語ドイvanでは", "xn--van-ub4bpb1w0ip486d", true],

  // Han with Katakana, Hiragana and Latin
  ["語ドイではvan", "xn--van-rb4bpb4w0ip486d", true],

  // Hiragana with Latin, Han and Katakana
  // NOTE(review): unlike the sibling rows, this sample begins with Katakana
  // ("イツ") rather than the script named first in the heading - confirm
  // whether that is intentional before relying on the heading.
  ["イツvan語ではド", "xn--van-ub4bpb1wvhsbx330n", true],

  // Hiragana with Latin, Katakana and Han
  ["ではvanドイ語", "xn--van-rb4bpb9w0ir486d", true],

  // Hiragana with Han, Latin and Katakana
  ["では語vanドイ", "xn--van-rb4bpb9w0im486d", true],

  // Hiragana with Han, Katakana and Latin
  ["では語ドイvan", "xn--van-rb4bpb6w0ip486d", true],

  // Hiragana with Katakana, Latin and Han
  ["ではドイvan語", "xn--van-rb4bpb6w0iu486d", true],

  // Hiragana with Katakana, Han and Latin
  ["ではドイ語van", "xn--van-rb4bpb6w0ir486d", true],

  // Katakana with Latin, Han and Hiragana
  ["ドイvan語では", "xn--van-ub4bpb1w0iu486d", true],

  // Katakana with Latin, Hiragana and Han
  ["ドイvanでは語", "xn--van-ub4bpb1w0iw486d", true],

  // Katakana with Han, Latin and Hiragana
  ["ドイ語vanでは", "xn--van-ub4bpb1w0ir486d", true],

  // Katakana with Han, Hiragana and Latin
  ["ドイ語ではvan", "xn--van-rb4bpb4w0ir486d", true],

  // Katakana with Hiragana, Latin and Han
  ["ドイではvan語", "xn--van-rb4bpb4w0iw486d", true],

  // Katakana with Hiragana, Han and Latin
  ["ドイでは語van", "xn--van-rb4bpb4w0it486d", true],

  // Han with common
  ["中国123", "xn--123-u68dy61b", true],

  // common with Han
  ["123中国", "xn--123-x68dy61b", true],

  // Characters that normalize to permitted characters
  //  (also tests Plane 1 supplementary characters)
  // The second column is plain ASCII here, so run_test() expects it verbatim.
  ["super𝟖", "super8", true],

  // Han from Plane 2
  ["𠀀𠀁𠀂", "xn--j50icd", false],
  ["𠜎𠜱𠝹", "xn--4m2igcqk", true],

  // Han from Plane 2 with js (UTF-16) escapes
  ["\uD840\uDC00\uD840\uDC01\uD840\uDC02", "xn--j50icd", false],
  ["\uD841\uDF0E\uD841\uDF31\uD841\uDF79", "xn--4m2igcqk", true],

  // Same with a lone high surrogate at the end
  // Throws due to unpaired surrogate
  //  ["\uD840\uDC00\uD840\uDC01\uD840", "xn--zn7c0336bda", false],

  // Latin text and Bengali digits
  ["super৪", "xn--super-k2l", false],

  // Bengali digits and Latin text
  ["৫ab", "xn--ab-x5f", false],

  // Bengali text and Latin digits
  ["অঙ্কুর8", "xn--8-70d2cp0j6dtd", true],

  // Latin digits and Bengali text
  ["5াব", "xn--5-h3d7c", true],

  // Mixed numbering systems
  // Throws due to bidi rule violation
  // ["٢٠۰٠", "xn--8hbae38c", false],

  // Traditional Chinese
  ["萬城", "xn--uis754h", true],

  // Simplified Chinese
  ["万城", "xn--chq31v", true],

  // Simplified-only and Traditional-only Chinese in the same label
  ["万萬城", "xn--chq31vsl1b", true],

  // Traditional-only and Simplified-only Chinese in the same label
  ["萬万城", "xn--chq31vrl1b", true],

  // Han and Latin and Bopomofo
  ["注音符号bopomofoㄅㄆㄇㄈ", "xn--bopomofo-hj5gkalm1637i876cuw0brk5f", false],

  // Han, bopomofo, Latin
  // Bug 1885096: Since the last character of "ㄅㄆㄇㄈ" is a CJK Ideograph,
  // just use the first character "ㄅ" from the sequence "ㄅㄆㄇㄈ".
  ["注音符号ㄅbopomofo", "xn--bopomofo-8i5gx891aylvccz9asi4e", false],

  // Latin, Han, Bopomofo
  ["bopomofo注音符号ㄅㄆㄇㄈ", "xn--bopomofo-hj5gkalm9637i876cuw0brk5f", false],

  // Latin, Bopomofo, Han
  ["bopomofoㄅㄆㄇㄈ注音符号", "xn--bopomofo-hj5gkalm3737i876cuw0brk5f", false],

  // Bopomofo, Han, Latin
  ["ㄅㄆㄇㄈ注音符号bopomofo", "xn--bopomofo-8i5gkalm3737i876cuw0brk5f", false],

  // Bopomofo, Latin, Han
  // Bug 1885096: Since the last character of "ㄅㄆㄇㄈ" is a CJK Ideograph,
  // just use the first character "ㄅ" from the sequence "ㄅㄆㄇㄈ".
  ["ㄅbopomofo注音符号", "xn--bopomofo-8i5g6891aylvccz9asi4e", false],

  // Han, bopomofo and katakana
  ["注音符号ㄅㄆㄇㄈボポモフォ", "xn--jckteuaez1shij0450gylvccz9asi4e", false],

  // Han, katakana, bopomofo
  ["注音符号ボポモフォㄅㄆㄇㄈ", "xn--jckteuaez6shij5350gylvccz9asi4e", false],

  // bopomofo, han, katakana
  ["ㄅㄆㄇㄈ注音符号ボポモフォ", "xn--jckteuaez1shij4450gylvccz9asi4e", false],

  // bopomofo, katakana, han
  ["ㄅㄆㄇㄈボポモフォ注音符号", "xn--jckteuaez1shij9450gylvccz9asi4e", false],

  // katakana, Han, bopomofo
  ["ボポモフォ注音符号ㄅㄆㄇㄈ", "xn--jckteuaez6shij0450gylvccz9asi4e", false],

  // katakana, bopomofo, Han
  ["ボポモフォㄅㄆㄇㄈ注音符号", "xn--jckteuaez6shij4450gylvccz9asi4e", false],

  // Han, Hangul and Latin
  ["韓한글hangul", "xn--hangul-2m5ti09k79ze", true],

  // Han, Latin and Hangul
  ["韓hangul한글", "xn--hangul-2m5to09k79ze", true],

  // Hangul, Han and Latin
  ["한글韓hangul", "xn--hangul-2m5th09k79ze", true],

  // Hangul, Latin and Han
  ["한글hangul韓", "xn--hangul-8m5t898k79ze", true],

  // Latin, Han and Hangul
  ["hangul韓한글", "xn--hangul-8m5ti09k79ze", true],

  // Latin, Hangul and Han
  ["hangul한글韓", "xn--hangul-8m5th09k79ze", true],

  // Hangul and katakana
  ["한글ハングル", "xn--qck1c2d4a9266lkmzb", false],

  // Katakana and Hangul
  ["ハングル한글", "xn--qck1c2d4a2366lkmzb", false],

  // Thai (also tests that node with over 63 UTF-8 octets doesn't fail)
  [
    "เครื่องทําน้ําทําน้ําแข็ง",
    "xn--22cdjb2fanb9fyepcbbb9dwh4a3igze4fdcd",
    true,
  ],

  // Effect of adding valid or invalid subdomains (bug 1399540)
  // These rows contain more than one label; run_test() still appends ".com".
  ["曹曳曷曽.ascii", "xn--movies.ascii", true],
  ["ascii.曹曳曷曽", "ascii.xn--movies", true],
  ["中国123.曹曳曷曽", "xn--123-u68dy61b.xn--movies", true],
  ["曹曳曷曽.中国123", "xn--movies.xn--123-u68dy61b", true],
  // Throw due to bogus Punycode
  // [
  //   "xn--accountlogin.䕮䕵䕶䕱",
  //   "xn--accountlogin.xn--google",
  //   true,
  // ],
  // [
  //   "䕮䕵䕶䕱.xn--accountlogin",
  //   "xn--google.xn--accountlogin",
  //   true,
  // ],

  // Arabic diacritic not allowed in Latin text (bug 1370497)
  ["goo\u0650gle", "xn--google-yri", false],
  // ...but Arabic diacritics are allowed on Arabic text
  ["العَرَبِي", "xn--mgbc0a5a6cxbzabt", true],

  // Hebrew diacritic also not allowed in Latin text (bug 1404349)
  ["goo\u05b4gle", "xn--google-rvh", false],

  // Accents above dotless-i are not allowed
  ["na\u0131\u0308ve", "xn--nave-mza04z", false],
  ["d\u0131\u0302ner", "xn--dner-lza40z", false],
  // but the corresponding accented-i (based on dotted i) is OK
  ["na\u00efve.com", "xn--nave-6pa.com", true],
  ["d\u00eener.com", "xn--dner-0pa.com", true],
];
    298 
    299 function run_test() {
    300  var idnService = Cc["@mozilla.org/network/idn-service;1"].getService(
    301    Ci.nsIIDNService
    302  );
    303 
    304  for (var j = 0; j < testcases.length; ++j) {
    305    var test = testcases[j];
    306    var URL = test[0] + ".com";
    307    var punycodeURL = test[1] + ".com";
    308    var expectedUnicode = test[2];
    309 
    310    var result;
    311    try {
    312      result = idnService.convertToDisplayIDN(URL);
    313    } catch (e) {
    314      result = ".com";
    315    }
    316    if (
    317      punycodeURL.substr(0, 4) == "xn--" ||
    318      punycodeURL.indexOf(".xn--") > 0
    319    ) {
    320      // test convertToDisplayIDN with a Unicode URL and with a
    321      //  Punycode URL if we have one
    322      Assert.equal(
    323        escape(result),
    324        expectedUnicode ? escape(URL) : escape(punycodeURL)
    325      );
    326 
    327      result = idnService.convertToDisplayIDN(punycodeURL);
    328      Assert.equal(
    329        escape(result),
    330        expectedUnicode ? escape(URL) : escape(punycodeURL)
    331      );
    332    } else {
    333      // The "punycode" URL isn't punycode. This happens in testcases
    334      // where the Unicode URL has become normalized to an ASCII URL,
    335      // so, even though expectedUnicode is true, the expected result
    336      // is equal to punycodeURL
    337      Assert.equal(escape(result), escape(punycodeURL));
    338    }
    339  }
    340 }