testUTF8.cpp (6422B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 */ 4 /* This Source Code Form is subject to the terms of the Mozilla Public 5 * License, v. 2.0. If a copy of the MPL was not distributed with this 6 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 7 8 #include "mozilla/Range.h" // mozilla::Range 9 #include "mozilla/Span.h" // mozilla::Span 10 #include "mozilla/Utf8.h" // mozilla::ConvertUtf8toUtf16 11 12 #include "js/CharacterEncoding.h" 13 #include "jsapi-tests/tests.h" 14 15 BEGIN_TEST(testUTF8_badUTF8) { 16 static const char badUTF8[] = "...\xC0..."; 17 JSString* str = JS_NewStringCopyZ(cx, badUTF8); 18 CHECK(str); 19 char16_t ch; 20 if (!JS_GetStringCharAt(cx, str, 3, &ch)) { 21 return false; 22 } 23 CHECK(ch == 0x00C0); 24 return true; 25 } 26 END_TEST(testUTF8_badUTF8) 27 28 BEGIN_TEST(testUTF8_bigUTF8) { 29 static const char bigUTF8[] = "...\xFB\xBF\xBF\xBF\xBF..."; 30 JSString* str = JS_NewStringCopyZ(cx, bigUTF8); 31 CHECK(str); 32 char16_t ch; 33 if (!JS_GetStringCharAt(cx, str, 3, &ch)) { 34 return false; 35 } 36 CHECK(ch == 0x00FB); 37 return true; 38 } 39 END_TEST(testUTF8_bigUTF8) 40 41 BEGIN_TEST(testUTF8_badSurrogate) { 42 static const char16_t badSurrogate[] = {'A', 'B', 'C', 0xDEEE, 'D', 'E', 0}; 43 mozilla::Range<const char16_t> tbchars(badSurrogate, js_strlen(badSurrogate)); 44 JS::Latin1CharsZ latin1 = JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars); 45 CHECK(latin1); 46 CHECK(latin1[3] == 0x00EE); 47 return true; 48 } 49 END_TEST(testUTF8_badSurrogate) 50 51 BEGIN_TEST(testUTF8_LossyConversion) { 52 // Maximal subparts of an ill-formed subsequence should be replaced with 53 // single REPLACEMENT CHARACTER. 54 55 // Input ends with partial sequence. 56 // clang-format off 57 const char* inputs1[] = { 58 "\xC2", 59 "\xDF", 60 "\xE0", 61 "\xE0\xA0", 62 "\xF0", 63 "\xF0\x90", 64 "\xF0\x90\x80", 65 }; 66 // clang-format on 67 68 char16_t outputBuf[8]; 69 mozilla::Span output(outputBuf, 8); 70 71 for (const char* input : inputs1) { 72 size_t len; 73 JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ( 74 cx, JS::UTF8Chars(input, js_strlen(input)), &len, 75 js::StringBufferArena); 76 CHECK(utf16); 77 CHECK(len == 1); 78 CHECK(utf16[0] == 0xFFFD); 79 80 // Make sure the behavior matches to encoding_rs. 81 len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)), 82 output); 83 CHECK(len == 1); 84 CHECK(outputBuf[0] == 0xFFFD); 85 } 86 87 // Partial sequence followed by ASCII range. 88 // clang-format off 89 const char* inputs2[] = { 90 "\xC2 ", 91 "\xDF ", 92 "\xE0 ", 93 "\xE0\xA0 ", 94 "\xF0 ", 95 "\xF0\x90 ", 96 "\xF0\x90\x80 ", 97 }; 98 // clang-format on 99 100 for (const char* input : inputs2) { 101 size_t len; 102 JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ( 103 cx, JS::UTF8Chars(input, js_strlen(input)), &len, 104 js::StringBufferArena); 105 CHECK(utf16); 106 CHECK(len == 2); 107 CHECK(utf16[0] == 0xFFFD); 108 CHECK(utf16[1] == 0x20); 109 110 len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)), 111 output); 112 CHECK(len == 2); 113 CHECK(outputBuf[0] == 0xFFFD); 114 CHECK(outputBuf[1] == 0x20); 115 } 116 117 // Partial sequence followed by other first code unit. 118 // clang-format off 119 const char* inputs3[] = { 120 "\xC2\xC2\x80", 121 "\xDF\xC2\x80", 122 "\xE0\xC2\x80", 123 "\xE0\xA0\xC2\x80", 124 "\xF0\xC2\x80", 125 "\xF0\x90\xC2\x80", 126 "\xF0\x90\x80\xC2\x80", 127 }; 128 // clang-format on 129 130 for (const char* input : inputs3) { 131 size_t len; 132 JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ( 133 cx, JS::UTF8Chars(input, js_strlen(input)), &len, 134 js::StringBufferArena); 135 CHECK(utf16); 136 CHECK(len == 2); 137 CHECK(utf16[0] == 0xFFFD); 138 CHECK(utf16[1] == 0x80); 139 140 len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)), 141 output); 142 CHECK(len == 2); 143 CHECK(outputBuf[0] == 0xFFFD); 144 CHECK(outputBuf[1] == 0x80); 145 } 146 147 // Invalid second byte. 148 // clang-format off 149 const char* inputs4[] = { 150 "\xE0\x9F\x80\x80", 151 "\xED\xA0\x80\x80", 152 "\xF0\x80\x80\x80", 153 "\xF4\x90\x80\x80", 154 }; 155 // clang-format on 156 157 for (const char* input : inputs4) { 158 size_t len; 159 JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ( 160 cx, JS::UTF8Chars(input, js_strlen(input)), &len, 161 js::StringBufferArena); 162 CHECK(utf16); 163 CHECK(len == 4); 164 CHECK(utf16[0] == 0xFFFD); 165 CHECK(utf16[1] == 0xFFFD); 166 CHECK(utf16[2] == 0xFFFD); 167 CHECK(utf16[3] == 0xFFFD); 168 169 len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)), 170 output); 171 CHECK(len == 4); 172 CHECK(outputBuf[0] == 0xFFFD); 173 CHECK(outputBuf[1] == 0xFFFD); 174 CHECK(outputBuf[2] == 0xFFFD); 175 CHECK(outputBuf[3] == 0xFFFD); 176 } 177 178 // Invalid second byte, with not sufficient number of units. 179 // clang-format off 180 const char* inputs5[] = { 181 "\xE0\x9F\x80", 182 "\xED\xA0\x80", 183 "\xF0\x80\x80", 184 "\xF4\x90\x80", 185 }; 186 const char* inputs6[] = { 187 "\xE0\x9F", 188 "\xED\xA0", 189 "\xF0\x80", 190 "\xF4\x90", 191 }; 192 // clang-format on 193 194 for (const char* input : inputs5) { 195 size_t len; 196 JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ( 197 cx, JS::UTF8Chars(input, js_strlen(input)), &len, 198 js::StringBufferArena); 199 CHECK(utf16); 200 CHECK(len == 3); 201 CHECK(utf16[0] == 0xFFFD); 202 CHECK(utf16[1] == 0xFFFD); 203 CHECK(utf16[2] == 0xFFFD); 204 205 len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)), 206 output); 207 CHECK(len == 3); 208 CHECK(outputBuf[0] == 0xFFFD); 209 CHECK(outputBuf[1] == 0xFFFD); 210 CHECK(outputBuf[2] == 0xFFFD); 211 } 212 213 for (const char* input : inputs6) { 214 size_t len; 215 JS::TwoByteCharsZ utf16 = JS::LossyUTF8CharsToNewTwoByteCharsZ( 216 cx, JS::UTF8Chars(input, js_strlen(input)), &len, 217 js::StringBufferArena); 218 CHECK(utf16); 219 CHECK(len == 2); 220 CHECK(utf16[0] == 0xFFFD); 221 CHECK(utf16[1] == 0xFFFD); 222 223 len = mozilla::ConvertUtf8toUtf16(mozilla::Span(input, js_strlen(input)), 224 output); 225 CHECK(len == 2); 226 CHECK(outputBuf[0] == 0xFFFD); 227 CHECK(outputBuf[1] == 0xFFFD); 228 } 229 return true; 230 } 231 END_TEST(testUTF8_LossyConversion)