tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

testUTF8.cpp (6422B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 */
      4 /* This Source Code Form is subject to the terms of the Mozilla Public
      5 * License, v. 2.0. If a copy of the MPL was not distributed with this
      6 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      7 
#include "mozilla/Range.h"  // mozilla::Range
#include "mozilla/Span.h"   // mozilla::Span
#include "mozilla/Utf8.h"   // mozilla::ConvertUtf8toUtf16

#include "js/CharacterEncoding.h"
#include "js/Utility.h"  // JS::UniqueLatin1Chars, JS::UniqueTwoByteChars
#include "jsapi-tests/tests.h"
     14 
     15 BEGIN_TEST(testUTF8_badUTF8) {
     16  static const char badUTF8[] = "...\xC0...";
     17  JSString* str = JS_NewStringCopyZ(cx, badUTF8);
     18  CHECK(str);
     19  char16_t ch;
     20  if (!JS_GetStringCharAt(cx, str, 3, &ch)) {
     21    return false;
     22  }
     23  CHECK(ch == 0x00C0);
     24  return true;
     25 }
     26 END_TEST(testUTF8_badUTF8)
     27 
     28 BEGIN_TEST(testUTF8_bigUTF8) {
     29  static const char bigUTF8[] = "...\xFB\xBF\xBF\xBF\xBF...";
     30  JSString* str = JS_NewStringCopyZ(cx, bigUTF8);
     31  CHECK(str);
     32  char16_t ch;
     33  if (!JS_GetStringCharAt(cx, str, 3, &ch)) {
     34    return false;
     35  }
     36  CHECK(ch == 0x00FB);
     37  return true;
     38 }
     39 END_TEST(testUTF8_bigUTF8)
     40 
     41 BEGIN_TEST(testUTF8_badSurrogate) {
     42  static const char16_t badSurrogate[] = {'A', 'B', 'C', 0xDEEE, 'D', 'E', 0};
     43  mozilla::Range<const char16_t> tbchars(badSurrogate, js_strlen(badSurrogate));
     44  JS::Latin1CharsZ latin1 = JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
     45  CHECK(latin1);
     46  CHECK(latin1[3] == 0x00EE);
     47  return true;
     48 }
     49 END_TEST(testUTF8_badSurrogate)
     50 
     51 BEGIN_TEST(testUTF8_LossyConversion) {
     52  // Maximal subparts of an ill-formed subsequence should be replaced with
     53  // single REPLACEMENT CHARACTER.
     54 
     55  // Input ends with partial sequence.
     56  // clang-format off
     57  const char* inputs1[] = {
     58    "\xC2",
     59    "\xDF",
     60    "\xE0",
     61    "\xE0\xA0",
     62    "\xF0",
     63    "\xF0\x90",
     64    "\xF0\x90\x80",
     65  };
     66  // clang-format on
     67 
     68  char16_t outputBuf[8];
     69  mozilla::Span output(outputBuf, 8);
     70 
     71  for (const char* input : inputs1) {
     72    size_t len;
     73    JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ(
     74        cx, JS::UTF8Chars(input, js_strlen(input)), &len,
     75        js::StringBufferArena);
     76    CHECK(utf16);
     77    CHECK(len == 1);
     78    CHECK(utf16[0] == 0xFFFD);
     79 
     80    // Make sure the behavior matches to encoding_rs.
     81    len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)),
     82                                      output);
     83    CHECK(len == 1);
     84    CHECK(outputBuf[0] == 0xFFFD);
     85  }
     86 
     87  // Partial sequence followed by ASCII range.
     88  // clang-format off
     89  const char* inputs2[] = {
     90    "\xC2 ",
     91    "\xDF ",
     92    "\xE0 ",
     93    "\xE0\xA0 ",
     94    "\xF0 ",
     95    "\xF0\x90 ",
     96    "\xF0\x90\x80 ",
     97  };
     98  // clang-format on
     99 
    100  for (const char* input : inputs2) {
    101    size_t len;
    102    JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ(
    103        cx, JS::UTF8Chars(input, js_strlen(input)), &len,
    104        js::StringBufferArena);
    105    CHECK(utf16);
    106    CHECK(len == 2);
    107    CHECK(utf16[0] == 0xFFFD);
    108    CHECK(utf16[1] == 0x20);
    109 
    110    len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)),
    111                                      output);
    112    CHECK(len == 2);
    113    CHECK(outputBuf[0] == 0xFFFD);
    114    CHECK(outputBuf[1] == 0x20);
    115  }
    116 
    117  // Partial sequence followed by other first code unit.
    118  // clang-format off
    119  const char* inputs3[] = {
    120    "\xC2\xC2\x80",
    121    "\xDF\xC2\x80",
    122    "\xE0\xC2\x80",
    123    "\xE0\xA0\xC2\x80",
    124    "\xF0\xC2\x80",
    125    "\xF0\x90\xC2\x80",
    126    "\xF0\x90\x80\xC2\x80",
    127  };
    128  // clang-format on
    129 
    130  for (const char* input : inputs3) {
    131    size_t len;
    132    JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ(
    133        cx, JS::UTF8Chars(input, js_strlen(input)), &len,
    134        js::StringBufferArena);
    135    CHECK(utf16);
    136    CHECK(len == 2);
    137    CHECK(utf16[0] == 0xFFFD);
    138    CHECK(utf16[1] == 0x80);
    139 
    140    len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)),
    141                                      output);
    142    CHECK(len == 2);
    143    CHECK(outputBuf[0] == 0xFFFD);
    144    CHECK(outputBuf[1] == 0x80);
    145  }
    146 
    147  // Invalid second byte.
    148  // clang-format off
    149  const char* inputs4[] = {
    150    "\xE0\x9F\x80\x80",
    151    "\xED\xA0\x80\x80",
    152    "\xF0\x80\x80\x80",
    153    "\xF4\x90\x80\x80",
    154  };
    155  // clang-format on
    156 
    157  for (const char* input : inputs4) {
    158    size_t len;
    159    JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ(
    160        cx, JS::UTF8Chars(input, js_strlen(input)), &len,
    161        js::StringBufferArena);
    162    CHECK(utf16);
    163    CHECK(len == 4);
    164    CHECK(utf16[0] == 0xFFFD);
    165    CHECK(utf16[1] == 0xFFFD);
    166    CHECK(utf16[2] == 0xFFFD);
    167    CHECK(utf16[3] == 0xFFFD);
    168 
    169    len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)),
    170                                      output);
    171    CHECK(len == 4);
    172    CHECK(outputBuf[0] == 0xFFFD);
    173    CHECK(outputBuf[1] == 0xFFFD);
    174    CHECK(outputBuf[2] == 0xFFFD);
    175    CHECK(outputBuf[3] == 0xFFFD);
    176  }
    177 
    178  // Invalid second byte, with not sufficient number of units.
    179  // clang-format off
    180  const char* inputs5[] = {
    181    "\xE0\x9F\x80",
    182    "\xED\xA0\x80",
    183    "\xF0\x80\x80",
    184    "\xF4\x90\x80",
    185  };
    186  const char* inputs6[] = {
    187    "\xE0\x9F",
    188    "\xED\xA0",
    189    "\xF0\x80",
    190    "\xF4\x90",
    191  };
    192  // clang-format on
    193 
    194  for (const char* input : inputs5) {
    195    size_t len;
    196    JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ(
    197        cx, JS::UTF8Chars(input, js_strlen(input)), &len,
    198        js::StringBufferArena);
    199    CHECK(utf16);
    200    CHECK(len == 3);
    201    CHECK(utf16[0] == 0xFFFD);
    202    CHECK(utf16[1] == 0xFFFD);
    203    CHECK(utf16[2] == 0xFFFD);
    204 
    205    len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)),
    206                                      output);
    207    CHECK(len == 3);
    208    CHECK(outputBuf[0] == 0xFFFD);
    209    CHECK(outputBuf[1] == 0xFFFD);
    210    CHECK(outputBuf[2] == 0xFFFD);
    211  }
    212 
    213  for (const char* input : inputs6) {
    214    size_t len;
    215    JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ(
    216        cx, JS::UTF8Chars(input, js_strlen(input)), &len,
    217        js::StringBufferArena);
    218    CHECK(utf16);
    219    CHECK(len == 2);
    220    CHECK(utf16[0] == 0xFFFD);
    221    CHECK(utf16[1] == 0xFFFD);
    222 
    223    len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)),
    224                                      output);
    225    CHECK(len == 2);
    226    CHECK(outputBuf[0] == 0xFFFD);
    227    CHECK(outputBuf[1] == 0xFFFD);
    228  }
    229  return true;
    230 }
    231 END_TEST(testUTF8_LossyConversion)